R Markdown file

This is an R Markdown document used to represent exploratory data analysis on the loan data provided by https://www.lendingclub.com/info/download-data.action

Reading & Cleaning the raw csv data

library(readr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
#Reading the Loan Data csv file as data.frame
LoanStats <- read_csv("C:/Users/Bhuvan/Desktop/Variable_Selection_Project/LoanStats.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   loan_amnt = col_integer(),
##   funded_amnt = col_integer(),
##   funded_amnt_inv = col_double(),
##   installment = col_double(),
##   annual_inc = col_double(),
##   dti = col_double(),
##   delinq_2yrs = col_integer(),
##   inq_last_6mths = col_integer(),
##   mths_since_last_delinq = col_integer(),
##   mths_since_last_record = col_integer(),
##   open_acc = col_integer(),
##   pub_rec = col_integer(),
##   revol_bal = col_integer(),
##   total_acc = col_integer(),
##   out_prncp = col_integer(),
##   out_prncp_inv = col_integer(),
##   total_pymnt = col_double(),
##   total_pymnt_inv = col_double(),
##   total_rec_prncp = col_double(),
##   total_rec_int = col_double()
##   # ... with 11 more columns
## )
## See spec(...) for full column specifications.
nrow(LoanStats)
## [1] 42535
ncol(LoanStats)
## [1] 122
#Removing the columns with no data
LoanStats1 <- LoanStats[,!apply(LoanStats, 2, function(x) 
  all(gsub(" ", "", x)=="", na.rm=TRUE))]
nrow(LoanStats1)
## [1] 42535
ncol(LoanStats1)
## [1] 54

Composition of all the portfolios

#Composition of all the portfolio
library(sqldf)
LoanStats_agg <- sqldf('select purpose, count(*) as counts, 
                       sum(loan_amnt) as total_exposure from 
                       LoanStats1 group by purpose')
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
LoanStats_agg$tot_loan_cnt <- as.numeric(sum(LoanStats_agg$counts))
LoanStats_agg$pct_cnt <- as.numeric(round(LoanStats_agg$counts/LoanStats_agg$tot_loan_cnt*100,2))
LoanStats_agg$tot_loan_amt <- as.numeric(sum(LoanStats_agg$total_exposure))
LoanStats_agg$pct_exposure <- as.numeric(round(LoanStats_agg$total_exposure/LoanStats_agg$tot_loan_amt*100,2))


lbls <- LoanStats_agg$purpose
lbls1 <- paste(lbls, LoanStats_agg$pct_exposure) # add percents to labels 
lbls1 <- paste(lbls1,"%",sep="") # ad % to labels
lbls2 <- paste(lbls, LoanStats_agg$pct_cnt) # add percents to labels 
lbls2 <- paste(lbls2,"%",sep="") # ad % to labels

pie(LoanStats_agg$total_exposure,labels = lbls1, col=rainbow(length(lbls)),
    radius = 1, main="Portfolio wise Total Exposure ",cex=0.75)

pie(LoanStats_agg$counts,labels = lbls2, col=rainbow(length(lbls)),
    radius = 1, main="Portfolio wise Number of loans",cex=0.75)

So as the debt consolidation represented the major portion of the loan data, we have done analysis for only debt consolidation portfolio.

Further cleaning and manipulation of data

ind_dc <- which(LoanStats1$purpose == 'debt_consolidation')
LoanStats2 <- LoanStats1[ind_dc,]

sqldf('select loan_status, count(*) as num_loans from LoanStats2
      group by loan_status')
##                                           loan_status num_loans
## 1                                         Charged Off      2792
## 2 Does not meet the credit policy. Status:Charged Off       292
## 3  Does not meet the credit policy. Status:Fully Paid       808
## 4                                          Fully Paid     15884
#cleaning the loan status variable 
LoanStats2$loan_status_new <- gsub('Does not meet the credit policy. Status:'
                                   ,'',LoanStats2$loan_status)

#converting the interest and revolving utilization rate to number
LoanStats2$int_rate_num <- as.numeric(gsub('%','',LoanStats2$int_rate))
LoanStats2$revol_util_num <- as.numeric(gsub('%','',LoanStats2$revol_util))

#Creating total loan count and total exposure variable
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))

Executive Summery of Analysis

Initial Variable Rejection

Initially we had 54 variables and based on our analysis we have rejected n variables.

Below are the some of the rejected variables with our reason.

i> Variables with same unique value throughout. eg policy_code, application_type, pymnt_plan

ii> Variables having free text values & needs extensive text mining. eg desc, title, emp_title

iii> Variables having major portion as blank or unique with minor stake in exposure. eg mths_since_last_record, pub_rec, chargeoff_within_12_mths

pub_rec: Number of derogatory public records

LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
library(sqldf)
pub_rec_agg <- sqldf('select pub_rec,
             loan_status_new,count(*) as cnts,
             sum(loan_amnt) as exposure, 
             round(sum(loan_amnt)/tot_loan_amt*100,2) as exp_pct
             from LoanStats2 group by pub_rec,
             loan_status_new')
pub_rec_agg
##   pub_rec loan_status_new  cnts  exposure exp_pct
## 1      NA      Fully Paid     1      1900    0.00
## 2       0     Charged Off  2787  37229325   14.99
## 3       0      Fully Paid 15841 198782825   80.03
## 4       1     Charged Off   294   3481150    1.40
## 5       1      Fully Paid   827   8621100    3.47
## 6       2     Charged Off     3     31950    0.01
## 7       2      Fully Paid    19    189000    0.08
## 8       3      Fully Paid     3     32400    0.01
## 9       4      Fully Paid     1      7200    0.00

chargeoff_within_12_mths: Number of charge-offs within 12 months

LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
library(sqldf)
chargeoff_within_12_mths_agg <- sqldf('select chargeoff_within_12_mths,
      loan_status_new,count(*) as cnts,
      sum(loan_amnt) as exposure, 
      round(sum(loan_amnt)/tot_loan_amt*100,2) as exp_pct
      from LoanStats2 group by chargeoff_within_12_mths,
      loan_status_new')
chargeoff_within_12_mths_agg
##   chargeoff_within_12_mths loan_status_new  cnts  exposure exp_pct
## 1                       NA     Charged Off     6     74725    0.03
## 2                       NA      Fully Paid    28    145850    0.06
## 3                        0     Charged Off  3078  40667700   16.37
## 4                        0      Fully Paid 16664 207488575   83.54

Analysing Continous Variables

Loan Amount

summary(LoanStats2$loan_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    7000   11000   12560   16500   35000
plot(LoanStats2$loan_amnt, ylab = "Loan Amount")

hist(LoanStats2$loan_amnt, breaks = 20,
     xlab = "Loan Amount", col="#CCCCFF",
     main="Histogram of Loan Amount Distribution")

a <- quantile(LoanStats2$loan_amnt,.90)
b <- quantile(LoanStats2$loan_amnt,0)
ind_la <- which(LoanStats2$loan_amnt < a & LoanStats2$loan_amnt > b)
LoanStats_la <- LoanStats2[ind_la,]
hist(LoanStats_la$loan_amnt, breaks = 10,
     xlab = "Loan Amount", col="#CCCCFF",
     main="Histogram of Loan Amount Distribution")
summary(LoanStats2$loan_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    7000   11000   12560   16500   35000
summary(LoanStats_la$loan_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     800    6200   10000   10707   15000   23975
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$loan_amnt, m=no_cases_bin))
  
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))
  
  LoanStats_la$loan_amnt_bin <- cut2(LoanStats_la$loan_amnt, m=no_cases_bin)
  
  
  loan_amnt_bin <- sqldf('select loan_amnt_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by loan_amnt_bin, loan_status_new')
  
  loan_amnt_bin_tran <- reshape(loan_amnt_bin, idvar = "loan_amnt_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
loan_amnt_bin_tran <-data.frame(loan_amnt_bin=loan_amnt_bin_tran$loan_amnt_bin,
                                 charge_off_Cnts=loan_amnt_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=loan_amnt_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=loan_amnt_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=loan_amnt_bin_tran$`exposure.Fully Paid`,
                 annual_inc_total=loan_amnt_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=loan_amnt_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=loan_amnt_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(loan_amnt_bin_tran$`cnts.Charged Off`/(loan_amnt_bin_tran$`cnts.Charged Off`+
                                                                                         loan_amnt_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(loan_amnt_bin_tran$`exposure.Charged Off`/(loan_amnt_bin_tran$`exposure.Charged Off`+
                                                                                            loan_amnt_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(loan_amnt_bin_tran$`cnts.Charged Off`/loan_amnt_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(loan_amnt_bin_tran$`exposure.Charged Off`/loan_amnt_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- loan_amnt_bin_tran[,c("loan_amnt_bin","pct_cnts_grp")]
  df2 <- loan_amnt_bin_tran[,c("loan_amnt_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=loan_amnt_bin_tran,
                         aes(x=loan_amnt_bin_tran$loan_amnt_bin,
                             y=loan_amnt_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Loan Amount bin", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Loan Amount bin", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## [[1]]

## 
## [[2]]

bin_plot(9)
## [[1]]

## 
## [[2]]

Inference

->Initially in 10 bins we can observe that the charge off rate remains almost same wrt to higher Loan amount bins. -> But as we converge the number of bins, we can observe a pattern, first charge off rate decreases and then after certain point it starts increasingto the maximum. -> We can say that the loans in the lowest and highest loan amount buckets tends to charge off more, that too more in the highest bucket.

The total amount committed to that loan at that point in time.

#The total amount committed to that loan at that point in time.

summary(LoanStats2$funded_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    6825   10800   12257   16000   35000
#summary(LoanStats2$loan_amnt)
plot(LoanStats2$funded_amnt, ylab = "Total amount committed")

hist(LoanStats2$funded_amnt, breaks = 20,
     xlab = "The total amount committed", col="#CCCCFF",
     main="Histogram of Total amount committed")

a <- quantile(LoanStats2$funded_amnt,.90)
b <- quantile(LoanStats2$funded_amnt,0)
ind_la <- which(LoanStats2$funded_amnt < a & LoanStats2$funded_amnt > b)
LoanStats_la <- LoanStats2[ind_la,]
hist(LoanStats_la$funded_amnt, breaks = 10,
     xlab = "Total amount committed", col="#CCCCFF",
     main="Histogram of Total amount committed")

summary(LoanStats2$funded_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     700    6825   10800   12257   16000   35000
summary(LoanStats_la$funded_amnt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     800    6025   10000   10565   14800   22725
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$funded_amnt, m=no_cases_bin))
  
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$funded_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))
  
  LoanStats_la$funded_amnt_bin <- cut2(LoanStats_la$funded_amnt, m=no_cases_bin)
  
  
  funded_amnt_bin <- sqldf('select funded_amnt_bin, loan_status_new, count(*) as 
                                      cnts, sum(funded_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(funded_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by funded_amnt_bin, loan_status_new')
  
  funded_amnt_bin_tran <- reshape(funded_amnt_bin, idvar = "funded_amnt_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
funded_amnt_bin_tran <-data.frame(funded_amnt_bin=funded_amnt_bin_tran$funded_amnt_bin,
                                 charge_off_Cnts=funded_amnt_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=funded_amnt_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=funded_amnt_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=funded_amnt_bin_tran$`exposure.Fully Paid`,
                 annual_inc_total=funded_amnt_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=funded_amnt_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=funded_amnt_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(funded_amnt_bin_tran$`cnts.Charged Off`/(funded_amnt_bin_tran$`cnts.Charged Off`+
                                                                                         funded_amnt_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(funded_amnt_bin_tran$`exposure.Charged Off`/(funded_amnt_bin_tran$`exposure.Charged Off`+
                                                                                            funded_amnt_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(funded_amnt_bin_tran$`cnts.Charged Off`/funded_amnt_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(funded_amnt_bin_tran$`exposure.Charged Off`/funded_amnt_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- funded_amnt_bin_tran[,c("funded_amnt_bin","pct_cnts_grp")]
  df2 <- funded_amnt_bin_tran[,c("funded_amnt_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=funded_amnt_bin_tran,
                         aes(x=funded_amnt_bin_tran$funded_amnt_bin,
                             y=funded_amnt_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Funded Amount bin", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Funded Amount bin", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(9)
## [[1]]

## 
## [[2]]

Inference -> Almost similar kind of pattern as loan amount.

The total amount committed by investors for that loan at that point in time.

Inference -> Almost similar kind of pattern as loan amount.

Interest Rate

# Interest Rate Distribution and Analysis
summary(LoanStats2$int_rate_num)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.42   10.00   12.61   12.61   15.21   24.11
plot(LoanStats2$int_rate_num, ylab = "Interest Rate")

hist(LoanStats2$int_rate_num, breaks = 20,
     xlab = "Interest Rate", col="#CCCCFF",
     main="Histogram of Interest Rate Distribution")

library(Hmisc)
# Creating 10 bins for interest rate
no_bins <- 10
no_cases_bin <- nrow(LoanStats2)/no_bins
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
LoanStats2$int_rate_bin <- cut2(LoanStats2$int_rate_num, m=no_cases_bin)

#Aggregating the data interest rate bin wise
intrate_bin <- sqldf('select int_rate_bin, loan_status_new, count(*) as 
                      cnts, sum(loan_amnt) as exposure,
                      tot_loan_amt, tot_loan_cnt,
                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                      from LoanStats2 group by int_rate_bin, loan_status_new')


intrate_bin_tran <- reshape(intrate_bin, idvar = "int_rate_bin", 
                             timevar = "loan_status_new", 
                             direction = "wide")

intrate_bin_tran <-data.frame(int_rate_bin=intrate_bin_tran$int_rate_bin,
                                 charge_off_Cnts=intrate_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=intrate_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=intrate_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=intrate_bin_tran$`exposure.Fully Paid`,
                                 tot_loan_amt=intrate_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=intrate_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(intrate_bin_tran$`cnts.Charged Off`/(intrate_bin_tran$`cnts.Charged Off`+
                                                                                         intrate_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(intrate_bin_tran$`exposure.Charged Off`/(intrate_bin_tran$`exposure.Charged Off`+
                                                                                            intrate_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(intrate_bin_tran$`cnts.Charged Off`/intrate_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(intrate_bin_tran$`exposure.Charged Off`/intrate_bin_tran$`tot_loan_amt.Charged Off`)*100)



df1 <- intrate_bin_tran[,c("int_rate_bin","pct_cnts_grp")]
df2 <- intrate_bin_tran[,c("int_rate_bin","pct_cnts_tot")]

p <- ggplot()+geom_bar(data=intrate_bin_tran,
                       aes(x=intrate_bin_tran$int_rate_bin,
                           y=intrate_bin_tran$charge_off_Cnts),
                       stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Interest Rate Bins", y = "")

p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
               axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))

p <- p + 
  geom_line(data=df1,aes(df1$int_rate_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
  geom_line(data=df2,aes(df2$int_rate_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
  scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                      values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))

p <- p + geom_point(data=df1,aes(df1$int_rate_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
p <- p + geom_point(data=df2,aes(df2$int_rate_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Inference -> Here there is a clear and linear relationship between Interest rates and the charge off rates. -> Rate of charge off increase as we move in higher interest rate bins.

The monthly Installment.

#Installment Distribution and Analysis
summary(LoanStats2$installment)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.22  205.86  327.14  364.14  482.65 1305.19
plot(LoanStats2$installment, ylab = "Installment Amount before")

hist(LoanStats2$installment)

hist(LoanStats2$installment, breaks = 20,
     xlab = "Installment Amount", col="#CCCCFF",
     main="Histogram of Installment Amount Distribution")

a <- quantile(LoanStats2$installment,.95)
b <- quantile(LoanStats2$installment,0)
ind_1 <- which(LoanStats2$installment < a & LoanStats2$installment > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$installment, ylab = "Installment Amount after")

hist(LoanStats_ins$installment, breaks = 20,
     xlab = "Installment Amount", col="#CCCCFF",
     main="Histogram of Installment Amount Distribution")

#Before
summary(LoanStats2$installment)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.22  205.86  327.14  364.14  482.65 1305.19
#After
summary(LoanStats_ins$installment)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.25  199.80  316.24  335.93  451.21  792.72
bin_plot <- function(x) {



no_bins <- x


library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
no_cases_bin
unique(cut2(LoanStats_ins$installment, m=no_cases_bin))

LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$installment))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))

LoanStats_ins$install_amt_bin <- cut2(LoanStats_ins$installment, m=no_cases_bin)


install_bin <- sqldf('select install_amt_bin, loan_status_new, count(*) as 
                      cnts, sum(loan_amnt) as exposure,
                     tot_loan_amt, tot_loan_cnt,
                     sum(annual_inc) as annual_inc_total, 
                     (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                     from LoanStats_ins group by install_amt_bin, loan_status_new')

install_bin_tran <- reshape(install_bin, idvar = "install_amt_bin", 
                            timevar = "loan_status_new", 
                            direction = "wide")

install_bin_tran <-data.frame(install_amt_bin=install_bin_tran$install_amt_bin,
                                 charge_off_Cnts=install_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=install_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=install_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=install_bin_tran$`exposure.Fully Paid`,
                                 tot_loan_amt=install_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=install_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(install_bin_tran$`cnts.Charged Off`/(install_bin_tran$`cnts.Charged Off`+
                                                                                         install_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(install_bin_tran$`exposure.Charged Off`/(install_bin_tran$`exposure.Charged Off`+
                                                                                            install_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(install_bin_tran$`cnts.Charged Off`/install_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(install_bin_tran$`exposure.Charged Off`/install_bin_tran$`tot_loan_amt.Charged Off`)*100)


df1 <- install_bin_tran[,c("install_amt_bin","pct_cnts_grp")]
df2 <- install_bin_tran[,c("install_amt_bin","pct_cnts_tot")]

p <- ggplot()+geom_bar(data=install_bin_tran,
                       aes(x=install_bin_tran$install_amt_bin,
                           y=install_bin_tran$charge_off_Cnts),
                       stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Installment Amount Bins", y = "")

p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
               axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))

p <- p + 
  geom_line(data=df1,aes(df1$install_amt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
  geom_line(data=df2,aes(df2$install_amt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
  scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                      values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))

p <- p + geom_point(data=df1,aes(df1$install_amt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
p <- p + geom_point(data=df2,aes(df2$install_amt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

   return(p)
}
bin_plot(12)

bin_plot(10)

Inference -> Here also no clear relationship is there between installment amount and the charge off rate.

Annual Income provided by the borrower during registration

#Annual Income Distribution and Analysis
summary(LoanStats2$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1896   41000   58000   67329   80000 2039784
plot(LoanStats2$annual_inc, ylab = "Annual Income Amount before")

hist(LoanStats2$annual_inc, breaks = 20,
     xlab = "Annual Income", col="#CCCCFF",
     main="Histogram of Annual Income Distribution before")

a <- quantile(LoanStats2$annual_inc,.90)
b <- quantile(LoanStats2$annual_inc,0)
ind_1 <- which(LoanStats2$annual_inc < a & LoanStats2$annual_inc > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$annual_inc, ylab = "Annual Income Amount after")

hist(LoanStats_ins$annual_inc, breaks = 20,
     xlab = "Loan Amount", col="#CCCCFF",
     main="Histogram of Annual Income Distribution after")

summary(LoanStats2$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1896   41000   58000   67329   80000 2039784
summary(LoanStats_ins$annual_inc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4080   40000   54000   56342   72000  109995
bin_plot <- function(x) {
  
  
  
  no_bins <- x
  
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_ins)/no_bins
  unique(cut2(LoanStats_ins$annual_inc, m=no_cases_bin))
  
  LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$annual_inc))
  LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
  
  LoanStats_ins$annual_inc_bin <- cut2(LoanStats_ins$annual_inc, m=no_cases_bin)
  
  
  annual_inc_bin <- sqldf('select annual_inc_bin, loan_status_new, count(*) as 
                          cnts, sum(loan_amnt) as exposure,
                          tot_loan_amt, tot_loan_cnt,
                          sum(annual_inc) as annual_inc_total,
                          (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                          from LoanStats_ins group by annual_inc_bin, loan_status_new')
  
  annual_inc_bin_tran <- reshape(annual_inc_bin, idvar = "annual_inc_bin", 
                                 timevar = "loan_status_new", 
                                 direction = "wide")
  
annual_inc_bin_tran <-data.frame(annual_inc_bin=annual_inc_bin_tran$annual_inc_bin,
                                 charge_off_Cnts=annual_inc_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=annual_inc_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=annual_inc_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=annual_inc_bin_tran$`exposure.Fully Paid`,
                                 tot_loan_amt=annual_inc_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=annual_inc_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(annual_inc_bin_tran$`cnts.Charged Off`/(annual_inc_bin_tran$`cnts.Charged Off`+
                                                                                         annual_inc_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(annual_inc_bin_tran$`exposure.Charged Off`/(annual_inc_bin_tran$`exposure.Charged Off`+
                                                                                            annual_inc_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(annual_inc_bin_tran$`cnts.Charged Off`/annual_inc_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(annual_inc_bin_tran$`exposure.Charged Off`/annual_inc_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- annual_inc_bin_tran[,c("annual_inc_bin","pct_cnts_grp")]
  df2 <- annual_inc_bin_tran[,c("annual_inc_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=annual_inc_bin_tran,
                         aes(x=annual_inc_bin_tran$annual_inc_bin,
                             y=annual_inc_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Annual Income Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  
    p3 <- ggplot() + 
     geom_line(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
     #geom_line(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
     scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                         values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
  labs(x = "Annual Income Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                       axis.text.y=element_text(size = 10, face="bold"))

p3 <- p3 + geom_point(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
  geom_text(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
#  geom_text(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)

p3 <- p3 + theme(axis.text.y=element_blank(),
       axis.ticks.y=element_blank())

p3

  retList <- list(p,p3)
  return(retList)

}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(8)
## [[1]]

## 
## [[2]]

Inference -> Initially we observed that rate of charge off was little zig-zag in 10 bins graph. -> But as we tried to reduce the bins it became evident that the rate of charge off decreases as wemove towards highr salary bins. -> So as we expect the individuals with lower salaries tends to charge off more.

Debt to Incom Ratio

#Debt to Incom Ratio Distribution and Analysis
summary(LoanStats2$dti)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.78   14.72   14.55   19.60   29.96
plot(LoanStats2$dti, ylab = "Debt to Incom Ratio before")

hist(LoanStats2$dti, breaks = 30,
     xlab = "Debt to Incom Ratio", col="#CCCCFF",
     main="Histogram of Debt to Incom Ratio Distribution")

a <- quantile(LoanStats2$dti,1)
b <- quantile(LoanStats2$dti,.05)
ind_1 <- which(LoanStats2$dti < a & LoanStats2$dti > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$dti, ylab = "Debt to Incom Ratio after")

hist(LoanStats_ins$dti, breaks = 40,
     xlab = "Debt to Incom Ratio", col="#CCCCFF",
     main="Histogram of Debt to Incom Ratio Distribution")

summary(LoanStats2$dti)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.78   14.72   14.55   19.60   29.96
summary(LoanStats_ins$dti)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.90   10.62   15.17   15.20   19.84   29.93
bin_plot <- function(x) {
  
  
  
  no_bins <- x
  
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_ins)/no_bins
  unique(cut2(LoanStats_ins$dti, m=no_cases_bin))
  
  LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$dti))
  LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
  
  LoanStats_ins$dti_bin <- cut2(LoanStats_ins$dti, m=no_cases_bin)
  
  
  dti_bin <- sqldf('select dti_bin, loan_status_new, count(*) as 
                   cnts, sum(loan_amnt) as exposure,
                   tot_loan_amt, tot_loan_cnt,
                   sum(annual_inc) as annual_inc_total,
                   (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                   from LoanStats_ins group by dti_bin, loan_status_new')
  
  dti_bin_tran <- reshape(dti_bin, idvar = "dti_bin", 
                          timevar = "loan_status_new", 
                          direction = "wide")
  
dti_bin_tran <-data.frame(dti_bin=dti_bin_tran$dti_bin,
                                 charge_off_Cnts=dti_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=dti_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=dti_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=dti_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=dti_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=dti_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=dti_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(dti_bin_tran$`cnts.Charged Off`/(dti_bin_tran$`cnts.Charged Off`+
                                                                                         dti_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(dti_bin_tran$`exposure.Charged Off`/(dti_bin_tran$`exposure.Charged Off`+
                                                                                            dti_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(dti_bin_tran$`cnts.Charged Off`/dti_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(dti_bin_tran$`exposure.Charged Off`/dti_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- dti_bin_tran[,c("dti_bin","pct_cnts_grp")]
  df2 <- dti_bin_tran[,c("dti_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=dti_bin_tran,
                         aes(x=dti_bin_tran$dti_bin,
                             y=dti_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Debt to Incom Ratio Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
 
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Debt to Income Ratio Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
  p3
  
  retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

Inference -> Initially we observed that rate of charge off was little zig-zag in 10 bins graph. -> But as we tried to reduce the bins it became evident that the rate of charge off increases as we move towards higher dti bins. -> So as we expect the individuals with higher debt burden tends to charge off more.

The number of inquiries in past 6 months Distribution and Analysis

#The number of inquiries in past 6 months Distribution and Analysis
summary(LoanStats2$inq_last_6mths)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   1.000   1.007   2.000  33.000       1
ind_na <- which(is.na(LoanStats2$inq_last_6mths))
LoanStats3 <- LoanStats2[-ind_na,]

summary(LoanStats3$inq_last_6mths)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.007   2.000  33.000
plot(LoanStats3$inq_last_6mths, ylab = "No of inquiries in 6 months before")

hist(LoanStats3$inq_last_6mths, breaks = 100,
     xlab = "No of inquiries in 6 months", col="#CCCCFF",
     main="Histogram of No of inquiries in 6 months Distribution")

a <- quantile(LoanStats3$inq_last_6mths,.99)
b <- quantile(LoanStats3$inq_last_6mths,0)
ind_1 <- which(LoanStats3$inq_last_6mths < a & LoanStats3$inq_last_6mths > b)
LoanStats_ins <- LoanStats3[ind_1,]
plot(LoanStats_ins$inq_last_6mths, ylab = "No of inquiries in 6 months after")

hist(LoanStats_ins$inq_last_6mths, breaks = 30,
     xlab = "No of inquiries in 6 months", col="#CCCCFF",
     main="Histogram of No of inquiries in 6 months Distribution")

ind_wo <- which(LoanStats3$inq_last_6mths < 10 )
LoanStats_wo <- LoanStats3[ind_wo,]

library(gmodels)

data.frame(CrossTable(LoanStats3$inq_last_6mths,LoanStats3$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19775 
## 
##  
##                           | LoanStats3$loan_status_new 
## LoanStats3$inq_last_6mths | Charged Off |  Fully Paid |   Row Total | 
## --------------------------|-------------|-------------|-------------|
##                         0 |        1165 |        8218 |        9383 | 
##                           |       0.124 |       0.876 |       0.474 | 
## --------------------------|-------------|-------------|-------------|
##                         1 |         878 |        4479 |        5357 | 
##                           |       0.164 |       0.836 |       0.271 | 
## --------------------------|-------------|-------------|-------------|
##                         2 |         493 |        2214 |        2707 | 
##                           |       0.182 |       0.818 |       0.137 | 
## --------------------------|-------------|-------------|-------------|
##                         3 |         310 |        1089 |        1399 | 
##                           |       0.222 |       0.778 |       0.071 | 
## --------------------------|-------------|-------------|-------------|
##                         4 |          87 |         339 |         426 | 
##                           |       0.204 |       0.796 |       0.022 | 
## --------------------------|-------------|-------------|-------------|
##                         5 |          55 |         174 |         229 | 
##                           |       0.240 |       0.760 |       0.012 | 
## --------------------------|-------------|-------------|-------------|
##                         6 |          36 |          89 |         125 | 
##                           |       0.288 |       0.712 |       0.006 | 
## --------------------------|-------------|-------------|-------------|
##                         7 |          26 |          36 |          62 | 
##                           |       0.419 |       0.581 |       0.003 | 
## --------------------------|-------------|-------------|-------------|
##                         8 |          16 |          19 |          35 | 
##                           |       0.457 |       0.543 |       0.002 | 
## --------------------------|-------------|-------------|-------------|
##                         9 |           7 |          12 |          19 | 
##                           |       0.368 |       0.632 |       0.001 | 
## --------------------------|-------------|-------------|-------------|
##                        10 |           0 |           9 |           9 | 
##                           |       0.000 |       1.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        11 |           2 |           3 |           5 | 
##                           |       0.400 |       0.600 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        12 |           2 |           2 |           4 | 
##                           |       0.500 |       0.500 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        13 |           1 |           1 |           2 | 
##                           |       0.500 |       0.500 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        14 |           3 |           0 |           3 | 
##                           |       1.000 |       0.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        15 |           1 |           3 |           4 | 
##                           |       0.250 |       0.750 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        17 |           1 |           0 |           1 | 
##                           |       1.000 |       0.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        18 |           0 |           1 |           1 | 
##                           |       0.000 |       1.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        19 |           0 |           2 |           2 | 
##                           |       0.000 |       1.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        28 |           0 |           1 |           1 | 
##                           |       0.000 |       1.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                        33 |           1 |           0 |           1 | 
##                           |       1.000 |       0.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##              Column Total |        3084 |       16691 |       19775 | 
## --------------------------|-------------|-------------|-------------|
## 
## 
##    t.x         t.y t.Freq prop.row.x  prop.row.y prop.row.Freq prop.col.x
## 1    0 Charged Off   1165          0 Charged Off     0.1241607          0
## 2    1 Charged Off    878          1 Charged Off     0.1638977          1
## 3    2 Charged Off    493          2 Charged Off     0.1821204          2
## 4    3 Charged Off    310          3 Charged Off     0.2215868          3
## 5    4 Charged Off     87          4 Charged Off     0.2042254          4
## 6    5 Charged Off     55          5 Charged Off     0.2401747          5
## 7    6 Charged Off     36          6 Charged Off     0.2880000          6
## 8    7 Charged Off     26          7 Charged Off     0.4193548          7
## 9    8 Charged Off     16          8 Charged Off     0.4571429          8
## 10   9 Charged Off      7          9 Charged Off     0.3684211          9
## 11  10 Charged Off      0         10 Charged Off     0.0000000         10
## 12  11 Charged Off      2         11 Charged Off     0.4000000         11
## 13  12 Charged Off      2         12 Charged Off     0.5000000         12
## 14  13 Charged Off      1         13 Charged Off     0.5000000         13
## 15  14 Charged Off      3         14 Charged Off     1.0000000         14
## 16  15 Charged Off      1         15 Charged Off     0.2500000         15
## 17  17 Charged Off      1         17 Charged Off     1.0000000         17
## 18  18 Charged Off      0         18 Charged Off     0.0000000         18
## 19  19 Charged Off      0         19 Charged Off     0.0000000         19
## 20  28 Charged Off      0         28 Charged Off     0.0000000         28
## 21  33 Charged Off      1         33 Charged Off     1.0000000         33
## 22   0  Fully Paid   8218          0  Fully Paid     0.8758393          0
## 23   1  Fully Paid   4479          1  Fully Paid     0.8361023          1
## 24   2  Fully Paid   2214          2  Fully Paid     0.8178796          2
## 25   3  Fully Paid   1089          3  Fully Paid     0.7784132          3
## 26   4  Fully Paid    339          4  Fully Paid     0.7957746          4
## 27   5  Fully Paid    174          5  Fully Paid     0.7598253          5
## 28   6  Fully Paid     89          6  Fully Paid     0.7120000          6
## 29   7  Fully Paid     36          7  Fully Paid     0.5806452          7
## 30   8  Fully Paid     19          8  Fully Paid     0.5428571          8
## 31   9  Fully Paid     12          9  Fully Paid     0.6315789          9
## 32  10  Fully Paid      9         10  Fully Paid     1.0000000         10
## 33  11  Fully Paid      3         11  Fully Paid     0.6000000         11
## 34  12  Fully Paid      2         12  Fully Paid     0.5000000         12
## 35  13  Fully Paid      1         13  Fully Paid     0.5000000         13
## 36  14  Fully Paid      0         14  Fully Paid     0.0000000         14
## 37  15  Fully Paid      3         15  Fully Paid     0.7500000         15
## 38  17  Fully Paid      0         17  Fully Paid     0.0000000         17
## 39  18  Fully Paid      1         18  Fully Paid     1.0000000         18
## 40  19  Fully Paid      2         19  Fully Paid     1.0000000         19
## 41  28  Fully Paid      1         28  Fully Paid     1.0000000         28
## 42  33  Fully Paid      0         33  Fully Paid     0.0000000         33
##     prop.col.y prop.col.Freq prop.tbl.x  prop.tbl.y prop.tbl.Freq
## 1  Charged Off  3.777562e-01          0 Charged Off  0.0589127686
## 2  Charged Off  2.846952e-01          1 Charged Off  0.0443994943
## 3  Charged Off  1.598573e-01          2 Charged Off  0.0249304678
## 4  Charged Off  1.005188e-01          3 Charged Off  0.0156763590
## 5  Charged Off  2.821012e-02          4 Charged Off  0.0043994943
## 6  Charged Off  1.783398e-02          5 Charged Off  0.0027812895
## 7  Charged Off  1.167315e-02          6 Charged Off  0.0018204804
## 8  Charged Off  8.430610e-03          7 Charged Off  0.0013147914
## 9  Charged Off  5.188067e-03          8 Charged Off  0.0008091024
## 10 Charged Off  2.269780e-03          9 Charged Off  0.0003539823
## 11 Charged Off  0.000000e+00         10 Charged Off  0.0000000000
## 12 Charged Off  6.485084e-04         11 Charged Off  0.0001011378
## 13 Charged Off  6.485084e-04         12 Charged Off  0.0001011378
## 14 Charged Off  3.242542e-04         13 Charged Off  0.0000505689
## 15 Charged Off  9.727626e-04         14 Charged Off  0.0001517067
## 16 Charged Off  3.242542e-04         15 Charged Off  0.0000505689
## 17 Charged Off  3.242542e-04         17 Charged Off  0.0000505689
## 18 Charged Off  0.000000e+00         18 Charged Off  0.0000000000
## 19 Charged Off  0.000000e+00         19 Charged Off  0.0000000000
## 20 Charged Off  0.000000e+00         28 Charged Off  0.0000000000
## 21 Charged Off  3.242542e-04         33 Charged Off  0.0000505689
## 22  Fully Paid  4.923612e-01          0  Fully Paid  0.4155752212
## 23  Fully Paid  2.683482e-01          1  Fully Paid  0.2264981037
## 24  Fully Paid  1.326463e-01          2  Fully Paid  0.1119595449
## 25  Fully Paid  6.524474e-02          3  Fully Paid  0.0550695322
## 26  Fully Paid  2.031035e-02          4  Fully Paid  0.0171428571
## 27  Fully Paid  1.042478e-02          5  Fully Paid  0.0087989886
## 28  Fully Paid  5.332215e-03          6  Fully Paid  0.0045006321
## 29  Fully Paid  2.156851e-03          7  Fully Paid  0.0018204804
## 30  Fully Paid  1.138338e-03          8  Fully Paid  0.0009608091
## 31  Fully Paid  7.189503e-04          9  Fully Paid  0.0006068268
## 32  Fully Paid  5.392127e-04         10  Fully Paid  0.0004551201
## 33  Fully Paid  1.797376e-04         11  Fully Paid  0.0001517067
## 34  Fully Paid  1.198251e-04         12  Fully Paid  0.0001011378
## 35  Fully Paid  5.991253e-05         13  Fully Paid  0.0000505689
## 36  Fully Paid  0.000000e+00         14  Fully Paid  0.0000000000
## 37  Fully Paid  1.797376e-04         15  Fully Paid  0.0001517067
## 38  Fully Paid  0.000000e+00         17  Fully Paid  0.0000000000
## 39  Fully Paid  5.991253e-05         18  Fully Paid  0.0000505689
## 40  Fully Paid  1.198251e-04         19  Fully Paid  0.0001011378
## 41  Fully Paid  5.991253e-05         28  Fully Paid  0.0000505689
## 42  Fully Paid  0.000000e+00         33  Fully Paid  0.0000000000
summary(LoanStats3$inq_last_6mths)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.007   2.000  33.000
summary(LoanStats_wo$inq_last_6mths)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.9853  2.0000  9.0000
library(gmodels)
aaa <- data.frame(CrossTable(LoanStats_wo$inq_last_6mths,LoanStats_wo$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19742 
## 
##  
##                             | LoanStats_wo$loan_status_new 
## LoanStats_wo$inq_last_6mths | Charged Off |  Fully Paid |   Row Total | 
## ----------------------------|-------------|-------------|-------------|
##                           0 |        1165 |        8218 |        9383 | 
##                             |       0.124 |       0.876 |       0.475 | 
## ----------------------------|-------------|-------------|-------------|
##                           1 |         878 |        4479 |        5357 | 
##                             |       0.164 |       0.836 |       0.271 | 
## ----------------------------|-------------|-------------|-------------|
##                           2 |         493 |        2214 |        2707 | 
##                             |       0.182 |       0.818 |       0.137 | 
## ----------------------------|-------------|-------------|-------------|
##                           3 |         310 |        1089 |        1399 | 
##                             |       0.222 |       0.778 |       0.071 | 
## ----------------------------|-------------|-------------|-------------|
##                           4 |          87 |         339 |         426 | 
##                             |       0.204 |       0.796 |       0.022 | 
## ----------------------------|-------------|-------------|-------------|
##                           5 |          55 |         174 |         229 | 
##                             |       0.240 |       0.760 |       0.012 | 
## ----------------------------|-------------|-------------|-------------|
##                           6 |          36 |          89 |         125 | 
##                             |       0.288 |       0.712 |       0.006 | 
## ----------------------------|-------------|-------------|-------------|
##                           7 |          26 |          36 |          62 | 
##                             |       0.419 |       0.581 |       0.003 | 
## ----------------------------|-------------|-------------|-------------|
##                           8 |          16 |          19 |          35 | 
##                             |       0.457 |       0.543 |       0.002 | 
## ----------------------------|-------------|-------------|-------------|
##                           9 |           7 |          12 |          19 | 
##                             |       0.368 |       0.632 |       0.001 | 
## ----------------------------|-------------|-------------|-------------|
##                Column Total |        3073 |       16669 |       19742 | 
## ----------------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")

p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "No of inquiries in 6 months", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
  scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                      values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

bin_plot <- function(x) {
  
  
  
  no_bins <- x
  
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_ins)/no_bins
  unique(cut2(LoanStats_ins$inq_last_6mths, m=no_cases_bin))
  
  LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$inq_last_6mths))
  LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
  
  LoanStats_ins$inq_last_6mths_bin <- cut2(LoanStats_ins$inq_last_6mths, m=no_cases_bin)
  
  
  inq_last_6mths_bin <- sqldf('select inq_last_6mths_bin, loan_status_new, count(*) as 
                   cnts, sum(loan_amnt) as exposure,
                   tot_loan_amt, tot_loan_cnt,
                   sum(annual_inc) as annual_inc_total,
                   (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                   from LoanStats_ins group by inq_last_6mths_bin, loan_status_new')
  
  inq_last_6mths_bin_tran <- reshape(inq_last_6mths_bin, idvar = "inq_last_6mths_bin", 
                          timevar = "loan_status_new", 
                          direction = "wide")
  
inq_last_6mths_bin_tran <-data.frame(inq_last_6mths_bin=inq_last_6mths_bin_tran$inq_last_6mths_bin,
                                 charge_off_Cnts=inq_last_6mths_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=inq_last_6mths_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=inq_last_6mths_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=inq_last_6mths_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=inq_last_6mths_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=inq_last_6mths_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=inq_last_6mths_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(inq_last_6mths_bin_tran$`cnts.Charged Off`/(inq_last_6mths_bin_tran$`cnts.Charged Off`+
                                                                                         inq_last_6mths_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(inq_last_6mths_bin_tran$`exposure.Charged Off`/(inq_last_6mths_bin_tran$`exposure.Charged Off`+
                                                                                            inq_last_6mths_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(inq_last_6mths_bin_tran$`cnts.Charged Off`/inq_last_6mths_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(inq_last_6mths_bin_tran$`exposure.Charged Off`/inq_last_6mths_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- inq_last_6mths_bin_tran[,c("inq_last_6mths_bin","pct_cnts_grp")]
  df2 <- inq_last_6mths_bin_tran[,c("inq_last_6mths_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=inq_last_6mths_bin_tran,
                         aes(x=inq_last_6mths_bin_tran$inq_last_6mths_bin,
                             y=inq_last_6mths_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "No of inquiries in 6 months Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "No of inquiries in 6 months Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
  p3
    retList <- list(p,p3)
  return(retList)

  
}
bin_plot(11)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

Inference -> We can see the rate of charge off gradually inceases as we move in buckets of hinger number of inquiries. -> it became quite evident when we reduce the number of bins.

The number of months since the borrower’s last delinquency.

#The number of months since the borrower's last delinquency.
summary(LoanStats2$mths_since_last_delinq)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   18.00   34.00   35.75   52.00  115.00   12657
ind_na <- which(is.na(LoanStats2$mths_since_last_delinq))
LoanStats3 <- LoanStats2[-ind_na,]

summary(LoanStats3$mths_since_last_delinq)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   18.00   34.00   35.75   52.00  115.00
plot(LoanStats3$mths_since_last_delinq, ylab = "No of months since last delinquency")

hist(LoanStats3$mths_since_last_delinq, breaks = 20,
     xlab = "No of months since last delinquency", col="#CCCCFF",
     main="Histogram of No of months since last delinquency")

a <- quantile(LoanStats3$mths_since_last_delinq,.95)
b <- quantile(LoanStats3$mths_since_last_delinq,0)
ind_1 <- which(LoanStats3$mths_since_last_delinq < a & LoanStats3$mths_since_last_delinq > b)
LoanStats_ins <- LoanStats3[ind_1,]
plot(LoanStats_ins$mths_since_last_delinq, ylab = "No of months since last delinquency")

hist(LoanStats_ins$mths_since_last_delinq, breaks = 20,
     xlab = "No of months since last delinquency", col="#CCCCFF",
     main="Histogram of No of months since last delinquency")

summary(LoanStats3$mths_since_last_delinq)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   18.00   34.00   35.75   52.00  115.00
summary(LoanStats_ins$mths_since_last_delinq)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   34.00   34.95   49.00   74.00
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_ins)/no_bins
  unique(cut2(LoanStats_ins$mths_since_last_delinq, m=no_cases_bin))
  
  LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$mths_since_last_delinq))
  LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
  
  LoanStats_ins$mths_since_last_delinq_bin <- cut2(LoanStats_ins$mths_since_last_delinq, m=no_cases_bin)
  
  
  mths_since_last_delinq_bin <- sqldf('select mths_since_last_delinq_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_ins group by mths_since_last_delinq_bin, loan_status_new')
  
  mths_since_last_delinq_bin_tran <- reshape(mths_since_last_delinq_bin, idvar = "mths_since_last_delinq_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
mths_since_last_delinq_bin_tran <-data.frame(mths_since_last_delinq_bin=mths_since_last_delinq_bin_tran$mths_since_last_delinq_bin,
                                 charge_off_Cnts=mths_since_last_delinq_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=mths_since_last_delinq_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=mths_since_last_delinq_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=mths_since_last_delinq_bin_tran$`exposure.Fully Paid`,
                 annual_inc_total=mths_since_last_delinq_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=mths_since_last_delinq_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=mths_since_last_delinq_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(mths_since_last_delinq_bin_tran$`cnts.Charged Off`/(mths_since_last_delinq_bin_tran$`cnts.Charged Off`+
                                                                                         mths_since_last_delinq_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(mths_since_last_delinq_bin_tran$`exposure.Charged Off`/(mths_since_last_delinq_bin_tran$`exposure.Charged Off`+
                                                                                            mths_since_last_delinq_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(mths_since_last_delinq_bin_tran$`cnts.Charged Off`/mths_since_last_delinq_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(mths_since_last_delinq_bin_tran$`exposure.Charged Off`/mths_since_last_delinq_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- mths_since_last_delinq_bin_tran[,c("mths_since_last_delinq_bin","pct_cnts_grp")]
  df2 <- mths_since_last_delinq_bin_tran[,c("mths_since_last_delinq_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=mths_since_last_delinq_bin_tran,
                         aes(x=mths_since_last_delinq_bin_tran$mths_since_last_delinq_bin,
                             y=mths_since_last_delinq_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "No of months since last delinquency", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "No of months since last delinquency", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(15)
## [[1]]

## 
## [[2]]

bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(6)
## [[1]]

## 
## [[2]]

Inference -> We can observe a quite a zig-zag pattern when we have divided the varible for No of months since last delinquency in 10 bins. -> But as converge the number of bins we start to observe concave shaped line graph for the rate of charge offs. -> So wecan deduce that the buckets haveing highest and lowest No of months since last delinquency has slightly higher chances of charge offs.

Total Interest Received to date

#Interest received to date
summary(LoanStats2$total_rec_int)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   854.5  1702.4  2659.9  3369.8 23563.7
plot(LoanStats2$total_rec_int, ylab = "Interest received to date")

hist(LoanStats2$total_rec_int, breaks = 30,
     xlab = "Interest received to date", col="#CCCCFF",
     main="Histogram of Interest received to date")

a <- quantile(LoanStats2$total_rec_int,.90)
b <- quantile(LoanStats2$total_rec_int,0)
ind_la <- which(LoanStats2$total_rec_int < a & LoanStats2$total_rec_int > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_rec_int, ylab = "Interest received to date")

hist(LoanStats_la$total_rec_int, breaks = 30,
     xlab = "Interest received to date", col="#CCCCFF",
     main="Histogram of Interest received to date")

summary(LoanStats2$total_rec_int)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   854.5  1702.4  2659.9  3369.8 23563.7
summary(LoanStats_la$total_rec_int)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     7.2   790.2  1498.3  1909.6  2693.1  6152.0
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))





bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$total_rec_int, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_rec_int))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$total_rec_int_bin <- cut2(LoanStats_la$total_rec_int, m=no_cases_bin)
  
  
  total_rec_int_bin <- sqldf('select total_rec_int_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by total_rec_int_bin, loan_status_new')
  
  total_rec_int_bin_tran <- reshape(total_rec_int_bin, idvar = "total_rec_int_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  total_rec_int_bin_tran <-data.frame(total_rec_int_bin=total_rec_int_bin_tran$total_rec_int_bin,
                                 charge_off_Cnts=total_rec_int_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=total_rec_int_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=total_rec_int_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=total_rec_int_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=total_rec_int_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=total_rec_int_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=total_rec_int_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(total_rec_int_bin_tran$`cnts.Charged Off`/(total_rec_int_bin_tran$`cnts.Charged Off`+
                                                                                         total_rec_int_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(total_rec_int_bin_tran$`exposure.Charged Off`/(total_rec_int_bin_tran$`exposure.Charged Off`+
                                                                                            total_rec_int_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(total_rec_int_bin_tran$`cnts.Charged Off`/total_rec_int_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(total_rec_int_bin_tran$`exposure.Charged Off`/total_rec_int_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- total_rec_int_bin_tran[,c("total_rec_int_bin","pct_cnts_grp")]
  df2 <- total_rec_int_bin_tran[,c("total_rec_int_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=total_rec_int_bin_tran,
                         aes(x=total_rec_int_bin_tran$total_rec_int_bin,
                             y=total_rec_int_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total Interest Received to date", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Total Interest Received to date", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(8)
## [[1]]

## 
## [[2]]

bin_plot(6)
## [[1]]

## 
## [[2]]

Inference -> Initial zig-zag pattern was observed in more number of bins. -> We can observe here that for Interest received to date for the first bin has maximum charge off and as we move along first three bins it gradually decreases. -> After reducing the bins, we can observe a trend of intial decrease in charge off rate as we move from lower interest received bins towards higher interest received bins but later it becomes almost constant. -> Based on what we have observed we can deduce that loans in highest interest received till date has maximum chances to charge off.

Principal received to date

# total_rec_prncp
summary(LoanStats2$total_rec_prncp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5261   10000   10944   15000   35000
plot(LoanStats2$total_rec_prncp, ylab = "Principal received to date")

hist(LoanStats2$total_rec_prncp, breaks = 20,
     xlab = "Interest received to date", col="#CCCCFF",
     main="Histogram of Interest received to date")

a <- quantile(LoanStats2$total_rec_prncp,.90)
b <- quantile(LoanStats2$total_rec_prncp,0)
ind_la <- which(LoanStats2$total_rec_prncp < a & LoanStats2$total_rec_prncp > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_rec_prncp, ylab = "Principal received to date")

hist(LoanStats_la$total_rec_prncp, breaks = 10,
     xlab = "Interest received to date", col="#CCCCFF",
     main="Histogram of Principal received to date")

summary(LoanStats2$total_rec_prncp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5261   10000   10944   15000   35000
summary(LoanStats_la$total_rec_prncp)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    32.51  5000.00  8927.02  9253.20 13000.00 20855.22
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$total_rec_prncp, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_rec_prncp))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$total_rec_prncp_bin <- cut2(LoanStats_la$total_rec_prncp, m=no_cases_bin)
  
  
  total_rec_prncp_bin <- sqldf('select total_rec_prncp_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by total_rec_prncp_bin, loan_status_new')
  
  total_rec_prncp_bin_tran <- reshape(total_rec_prncp_bin, idvar = "total_rec_prncp_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  total_rec_prncp_bin_tran <-data.frame(total_rec_prncp_bin=total_rec_prncp_bin_tran$total_rec_prncp_bin,
                                 charge_off_Cnts=total_rec_prncp_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=total_rec_prncp_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=total_rec_prncp_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=total_rec_prncp_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=total_rec_prncp_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=total_rec_prncp_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=total_rec_prncp_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(total_rec_prncp_bin_tran$`cnts.Charged Off`/(total_rec_prncp_bin_tran$`cnts.Charged Off`+
                                                                                         total_rec_prncp_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(total_rec_prncp_bin_tran$`exposure.Charged Off`/(total_rec_prncp_bin_tran$`exposure.Charged Off`+
                                                                                            total_rec_prncp_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(total_rec_prncp_bin_tran$`cnts.Charged Off`/total_rec_prncp_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(total_rec_prncp_bin_tran$`exposure.Charged Off`/total_rec_prncp_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- total_rec_prncp_bin_tran[,c("total_rec_prncp_bin","pct_cnts_grp")]
  df2 <- total_rec_prncp_bin_tran[,c("total_rec_prncp_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=total_rec_prncp_bin_tran,
                         aes(x=total_rec_prncp_bin_tran$total_rec_prncp_bin,
                             y=total_rec_prncp_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Principal received to date Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Principal received to date Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

Inference -> It is quite evident from the line graph that maximum charge off rate is observed in the lowest bucket of the principal received till date. -> We also know that the during the loan tenure a individual pays minimum part of principal at the start of repayment schedule and which gradully increases so based on these 2 hypothesis we can extrapolate that the loans have a tendency to charge off in their early period of repayment schedule.

Payments received to date for portion of total amount funded by investors

# total_pymnt_inv
summary(LoanStats2$total_pymnt_inv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    6012   10965   13008   17675   58564
plot(LoanStats2$total_pymnt_inv, ylab = "Payments received to date for portion of total amount funded by investors")

hist(LoanStats2$total_pymnt_inv, breaks = 20,
     xlab = "Payments received from investors", col="#CCCCFF",
     main="Histogram of Payments received to date for portion of total amount funded by investors")

a <- quantile(LoanStats2$total_pymnt_inv,.80)
b <- quantile(LoanStats2$total_pymnt_inv,0)
ind_la <- which(LoanStats2$total_pymnt_inv < a & LoanStats2$total_pymnt_inv > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_pymnt_inv, ylab = "Payments received to date for portion of total amount funded by investors")

hist(LoanStats_la$total_pymnt_inv, breaks = 10,
     xlab = "Payments received from investors", col="#CCCCFF",
     main="Histogram of Payments received to date for portion of total amount funded by investors")

summary(LoanStats2$total_pymnt_inv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    6012   10965   13008   17675   58564
summary(LoanStats_la$total_pymnt_inv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.8  5233.8  8864.0  9274.3 13282.5 19890.5
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$total_pymnt_inv, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_pymnt_inv))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$totalpymntinv_bin <- cut2(LoanStats_la$total_pymnt_inv, m=no_cases_bin)
  
  
  totalpymntinv_bin_tran <- sqldf('select totalpymntinv_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by totalpymntinv_bin, loan_status_new')
  
  totalpymntinv_bin_tran <- reshape(totalpymntinv_bin_tran , idvar = "totalpymntinv_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  totalpymntinv_bin_tran <-data.frame(totalpymntinv_bin=totalpymntinv_bin_tran$totalpymntinv_bin,
                                 charge_off_Cnts=totalpymntinv_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=totalpymntinv_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=totalpymntinv_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=totalpymntinv_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=totalpymntinv_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=totalpymntinv_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=totalpymntinv_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(totalpymntinv_bin_tran$`cnts.Charged Off`/(totalpymntinv_bin_tran$`cnts.Charged Off`+
                                                                                         totalpymntinv_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(totalpymntinv_bin_tran$`exposure.Charged Off`/(totalpymntinv_bin_tran$`exposure.Charged Off`+
                                                                                            totalpymntinv_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(totalpymntinv_bin_tran$`cnts.Charged Off`/totalpymntinv_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(totalpymntinv_bin_tran$`exposure.Charged Off`/totalpymntinv_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- totalpymntinv_bin_tran[,c("totalpymntinv_bin","pct_cnts_grp")]
  df2 <- totalpymntinv_bin_tran[,c("totalpymntinv_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=totalpymntinv_bin_tran,
                         aes(x=totalpymntinv_bin_tran$totalpymntinv_bin,
                             y=totalpymntinv_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Payments received from investors Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Payments received from investors Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

The total number of credit lines currently in the borrower’s credit file

# The total number of credit lines currently in the borrower's credit file
ind <- which(is.na(LoanStats2$total_acc))
LoanStats3 <- LoanStats2[-ind,]

summary(LoanStats3$total_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.00   21.00   22.91   30.00   90.00
plot(LoanStats3$total_acc, 
     ylab = "Total number of credit lines")

hist(LoanStats3$total_acc, breaks = 30,
     xlab = "Total number of credit lines", col="#CCCCFF",
     main="Histogram of Total number of credit lines")

a <- quantile(LoanStats3$total_acc,.90)
b <- quantile(LoanStats3$total_acc,0)
ind_la <- which(LoanStats3$total_acc < a & LoanStats3$total_acc > b)
LoanStats_la <- LoanStats3[ind_la,]
plot(LoanStats_la$total_acc, 
     ylab = "Total number of credit lines")

hist(LoanStats_la$total_acc, breaks = 20,
     xlab = "Total number of credit lines", col="#CCCCFF",
     main="Histogram of Total number of credit lines")

summary(LoanStats3$total_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   14.00   21.00   22.91   30.00   90.00
summary(LoanStats_la$total_acc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0    14.0    20.0    20.3    27.0    38.0
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$total_acc, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_acc))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$total_acc_bin <- cut2(LoanStats_la$total_acc, m=no_cases_bin)
  
  
  total_acc_bin <- sqldf('select total_acc_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by total_acc_bin, loan_status_new')
  
  total_acc_bin_tran <- reshape(total_acc_bin, idvar = "total_acc_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  total_acc_bin_tran <-data.frame(total_acc_bin=total_acc_bin_tran$total_acc_bin,
                                 charge_off_Cnts=total_acc_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=total_acc_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=total_acc_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=total_acc_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=total_acc_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=total_acc_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=total_acc_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(total_acc_bin_tran$`cnts.Charged Off`/(total_acc_bin_tran$`cnts.Charged Off`+
                                                                                         total_acc_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(total_acc_bin_tran$`exposure.Charged Off`/(total_acc_bin_tran$`exposure.Charged Off`+
                                                                                            total_acc_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(total_acc_bin_tran$`cnts.Charged Off`/total_acc_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(total_acc_bin_tran$`exposure.Charged Off`/total_acc_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- total_acc_bin_tran[,c("total_acc_bin","pct_cnts_grp")]
  df2 <- total_acc_bin_tran[,c("total_acc_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=total_acc_bin_tran,
                         aes(x=total_acc_bin_tran$total_acc_bin,
                             y=total_acc_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total number of credit lines Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Total number of credit lines Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

bin_plot(8)
## [[1]]

## 
## [[2]]

Inference -> Intial inspection of the graph with 10 bins suggests a almost constant rate of charge off in all the bins. -> But as we reduce the number of bins we can observe the rate of charge off is maximum in the first bucket and after decreasing it becomes constant. -> So we can say that the individuals with lower number of credit lines hade more tendancy to charge off.

Total credit revolving balance

# Total credit revolving balance

summary(LoanStats2$revol_bal)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5459   10681   15231   18658  465731
plot(LoanStats2$revol_bal, 
     ylab = "Total credit revolving balance")

hist(LoanStats2$revol_bal, breaks = 30,
     xlab = "Total credit revolving balance", col="#CCCCFF",
     main="Histogram of Total credit revolving balance")

a <- quantile(LoanStats2$revol_bal,.75)
b <- quantile(LoanStats2$revol_bal,0)
ind_la <- which(LoanStats2$revol_bal < a & LoanStats2$revol_bal > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$revol_bal, 
     ylab = "Total credit revolving balance")

hist(LoanStats_la$revol_bal, breaks = 20,
     xlab = "Total credit revolving balance", col="#CCCCFF",
     main="Histogram of Total credit revolving balance")

summary(LoanStats2$revol_bal)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    5459   10681   15231   18658  465731
summary(LoanStats_la$revol_bal)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    4396    8121    8479   12340   18651
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$revol_bal, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$revol_bal))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$revol_bal_bin <- cut2(LoanStats_la$revol_bal, m=no_cases_bin)
  
  
  revol_bal_bin <- sqldf('select revol_bal_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by revol_bal_bin, loan_status_new')
  
  revol_bal_bin_tran <- reshape(revol_bal_bin, idvar = "revol_bal_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  revol_bal_bin_tran <-data.frame(revol_bal_bin=revol_bal_bin_tran$revol_bal_bin,
                                 charge_off_Cnts=revol_bal_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=revol_bal_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=revol_bal_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=revol_bal_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=revol_bal_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=revol_bal_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=revol_bal_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(revol_bal_bin_tran$`cnts.Charged Off`/(revol_bal_bin_tran$`cnts.Charged Off`+
                                                                                         revol_bal_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(revol_bal_bin_tran$`exposure.Charged Off`/(revol_bal_bin_tran$`exposure.Charged Off`+
                                                                                            revol_bal_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(revol_bal_bin_tran$`cnts.Charged Off`/revol_bal_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(revol_bal_bin_tran$`exposure.Charged Off`/revol_bal_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- revol_bal_bin_tran[,c("revol_bal_bin","pct_cnts_grp")]
  df2 <- revol_bal_bin_tran[,c("revol_bal_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=revol_bal_bin_tran,
                         aes(x=revol_bal_bin_tran$revol_bal_bin,
                             y=revol_bal_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total credit revolving balance Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Total credit revolving balance Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

bin_plot(8)
## [[1]]

## 
## [[2]]

Inference -> Almost no pattern observed between Total credit revolving balance and rate of charge off.

Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.

# Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.

ind <- which(is.na(LoanStats2$revol_util_num))
LoanStats3 <- LoanStats2[-ind,]

#cleaning the percentage which is above 100
ind <- which(LoanStats3$revol_util_num > 100)
LoanStats3 <- LoanStats3[-ind,]


summary(LoanStats3$revol_util_num)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   34.30   55.60   54.35   75.70   99.90
plot(LoanStats3$revol_util_num, 
     ylab = "Revolving line utilization rate")

hist(LoanStats3$revol_util_num, breaks = 20,
     xlab = "Revolving line utilization rate", col="#CCCCFF",
     main="Histogram of Revolving line utilization rate")

a <- quantile(LoanStats3$revol_util_num,1)
b <- quantile(LoanStats3$revol_util_num,.1)
ind_la <- which(LoanStats3$revol_util_num < a & LoanStats3$revol_util_num > b)
LoanStats_la <- LoanStats3[ind_la,]
plot(LoanStats_la$revol_util_num, 
     ylab = "Revolving line utilization rate")

hist(LoanStats_la$revol_util_num, breaks = 30,
     xlab = "Revolving line utilization rate", col="#CCCCFF",
     main="Histogram of Revolving line utilization rate")

summary(LoanStats3$revol_util_num)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   34.30   55.60   54.35   75.70   99.90
summary(LoanStats_la$revol_util_num)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.90   41.10   59.40   59.38   77.85   99.80
bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_la)/no_bins
  unique(cut2(LoanStats_la$revol_util_num, m=no_cases_bin))
  
  LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$revol_util_num))
  LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
  
  LoanStats_la$revol_util_num_bin <- cut2(LoanStats_la$revol_util_num, m=no_cases_bin)
  
  
  revol_util_num_bin <- sqldf('select revol_util_num_bin, loan_status_new, count(*) as 
                                      cnts, sum(loan_amnt) as exposure,
                                      tot_loan_amt, tot_loan_cnt,
                                      sum(annual_inc) as annual_inc_total,
                                      (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                      from LoanStats_la group by revol_util_num_bin, loan_status_new')
  
  revol_util_num_bin_tran <- reshape(revol_util_num_bin, idvar = "revol_util_num_bin", 
                                             timevar = "loan_status_new", 
                                             direction = "wide")
  
  revol_util_num_bin_tran <-data.frame(revol_util_num_bin=revol_util_num_bin_tran$revol_util_num_bin,
                                 charge_off_Cnts=revol_util_num_bin_tran$`cnts.Charged Off`,
                                 Fully_paid_Cnts=revol_util_num_bin_tran$`cnts.Fully Paid`,
                                 charge_off_Exp=revol_util_num_bin_tran$`exposure.Charged Off`,
                                 Fully_paid_Exp=revol_util_num_bin_tran$`exposure.Fully Paid`,
                                         annual_inc_total=revol_util_num_bin_tran$`annual_inc_total.Charged Off`,
                                 tot_loan_amt=revol_util_num_bin_tran$`tot_loan_amt.Charged Off`,
                                 tot_loan_cnt=revol_util_num_bin_tran$`tot_loan_cnt.Charged Off`,
                                 pct_cnts_grp=(revol_util_num_bin_tran$`cnts.Charged Off`/(revol_util_num_bin_tran$`cnts.Charged Off`+
                                                                                         revol_util_num_bin_tran$`cnts.Fully Paid`))*100,
                                 pct_exp_grp=(revol_util_num_bin_tran$`exposure.Charged Off`/(revol_util_num_bin_tran$`exposure.Charged Off`+
                                                                                            revol_util_num_bin_tran$`exposure.Fully Paid`))*100,
                                 pct_cnts_tot=(revol_util_num_bin_tran$`cnts.Charged Off`/revol_util_num_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                 pct_exp_tot=(revol_util_num_bin_tran$`exposure.Charged Off`/revol_util_num_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- revol_util_num_bin_tran[,c("revol_util_num_bin","pct_cnts_grp")]
  df2 <- revol_util_num_bin_tran[,c("revol_util_num_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=revol_util_num_bin_tran,
                         aes(x=revol_util_num_bin_tran$revol_util_num_bin,
                             y=revol_util_num_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Revolving line utilization rate Bins", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Revolving line utilization rate Bins", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
    retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]

bin_plot(10)
## [[1]]

## 
## [[2]]

Inference -> This is quite clear here we observe that as we move towards higher revolving utilization rate charge off rate increases. -> We can deduce that the individual who have used more there credit lines tends to charge off more.

Analysing Categorical Variables

The number of payments on the loan. Values are in months and can be either 36 or 60.

unique(LoanStats2$term)
## [1] "60 months" "36 months"
library(gmodels)
aaa <- data.frame(CrossTable(LoanStats2$term,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                 | LoanStats2$loan_status_new 
## LoanStats2$term | Charged Off |  Fully Paid |   Row Total | 
## ----------------|-------------|-------------|-------------|
##       36 months |        1715 |       12394 |       14109 | 
##                 |       0.122 |       0.878 |       0.713 | 
## ----------------|-------------|-------------|-------------|
##       60 months |        1369 |        4298 |        5667 | 
##                 |       0.242 |       0.758 |       0.287 | 
## ----------------|-------------|-------------|-------------|
##    Column Total |        3084 |       16692 |       19776 | 
## ----------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Loan Term", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Inference -> This is evident from the graph despite having large number of charge offs in loans with 36 month term the loans in 60 month term tends to charge off more. -> Longer term duration increases the chances of charge off.

Loan Grade assigned by Lending club

library(gmodels)
#grade vs loan status
unique(LoanStats2$grade)
## [1] "C" "B" "D" "A" "E" "F" "G"
aaa <- data.frame(CrossTable(LoanStats2$grade,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                  | LoanStats2$loan_status_new 
## LoanStats2$grade | Charged Off |  Fully Paid |   Row Total | 
## -----------------|-------------|-------------|-------------|
##                A |         245 |        3644 |        3889 | 
##                  |       0.063 |       0.937 |       0.197 | 
## -----------------|-------------|-------------|-------------|
##                B |         670 |        5088 |        5758 | 
##                  |       0.116 |       0.884 |       0.291 | 
## -----------------|-------------|-------------|-------------|
##                C |         690 |        3503 |        4193 | 
##                  |       0.165 |       0.835 |       0.212 | 
## -----------------|-------------|-------------|-------------|
##                D |         673 |        2453 |        3126 | 
##                  |       0.215 |       0.785 |       0.158 | 
## -----------------|-------------|-------------|-------------|
##                E |         477 |        1344 |        1821 | 
##                  |       0.262 |       0.738 |       0.092 | 
## -----------------|-------------|-------------|-------------|
##                F |         243 |         480 |         723 | 
##                  |       0.336 |       0.664 |       0.037 | 
## -----------------|-------------|-------------|-------------|
##                G |          86 |         180 |         266 | 
##                  |       0.323 |       0.677 |       0.013 | 
## -----------------|-------------|-------------|-------------|
##     Column Total |        3084 |       16692 |       19776 | 
## -----------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Loan Grade", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Inference -> As expected the charge off rate increases aswe move towards lower Loan grade.

Loan Sub Grade assigned by Lending club

library(gmodels)
#grade vs loan status
unique(LoanStats2$sub_grade)
##  [1] "C5" "C3" "B5" "C1" "D1" "A1" "C4" "B4" "B3" "C2" "A3" "A4" "A5" "D2"
## [15] "A2" "B1" "B2" "E4" "D3" "F3" "E3" "D4" "F1" "E1" "E5" "G4" "E2" "D5"
## [29] "F2" "G3" "F5" "G1" "F4" "G2" "G5"
aaa <- data.frame(CrossTable(LoanStats2$sub_grade,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                      | LoanStats2$loan_status_new 
## LoanStats2$sub_grade | Charged Off |  Fully Paid |   Row Total | 
## ---------------------|-------------|-------------|-------------|
##                   A1 |          10 |         353 |         363 | 
##                      |       0.028 |       0.972 |       0.018 | 
## ---------------------|-------------|-------------|-------------|
##                   A2 |          34 |         475 |         509 | 
##                      |       0.067 |       0.933 |       0.026 | 
## ---------------------|-------------|-------------|-------------|
##                   A3 |          43 |         611 |         654 | 
##                      |       0.066 |       0.934 |       0.033 | 
## ---------------------|-------------|-------------|-------------|
##                   A4 |          77 |        1128 |        1205 | 
##                      |       0.064 |       0.936 |       0.061 | 
## ---------------------|-------------|-------------|-------------|
##                   A5 |          81 |        1077 |        1158 | 
##                      |       0.070 |       0.930 |       0.059 | 
## ---------------------|-------------|-------------|-------------|
##                   B1 |          69 |         750 |         819 | 
##                      |       0.084 |       0.916 |       0.041 | 
## ---------------------|-------------|-------------|-------------|
##                   B2 |          99 |         876 |         975 | 
##                      |       0.102 |       0.898 |       0.049 | 
## ---------------------|-------------|-------------|-------------|
##                   B3 |         166 |        1193 |        1359 | 
##                      |       0.122 |       0.878 |       0.069 | 
## ---------------------|-------------|-------------|-------------|
##                   B4 |         150 |        1080 |        1230 | 
##                      |       0.122 |       0.878 |       0.062 | 
## ---------------------|-------------|-------------|-------------|
##                   B5 |         186 |        1189 |        1375 | 
##                      |       0.135 |       0.865 |       0.070 | 
## ---------------------|-------------|-------------|-------------|
##                   C1 |         150 |         911 |        1061 | 
##                      |       0.141 |       0.859 |       0.054 | 
## ---------------------|-------------|-------------|-------------|
##                   C2 |         171 |         888 |        1059 | 
##                      |       0.161 |       0.839 |       0.054 | 
## ---------------------|-------------|-------------|-------------|
##                   C3 |         148 |         653 |         801 | 
##                      |       0.185 |       0.815 |       0.041 | 
## ---------------------|-------------|-------------|-------------|
##                   C4 |         119 |         539 |         658 | 
##                      |       0.181 |       0.819 |       0.033 | 
## ---------------------|-------------|-------------|-------------|
##                   C5 |         102 |         512 |         614 | 
##                      |       0.166 |       0.834 |       0.031 | 
## ---------------------|-------------|-------------|-------------|
##                   D1 |         102 |         433 |         535 | 
##                      |       0.191 |       0.809 |       0.027 | 
## ---------------------|-------------|-------------|-------------|
##                   D2 |         160 |         623 |         783 | 
##                      |       0.204 |       0.796 |       0.040 | 
## ---------------------|-------------|-------------|-------------|
##                   D3 |         144 |         538 |         682 | 
##                      |       0.211 |       0.789 |       0.034 | 
## ---------------------|-------------|-------------|-------------|
##                   D4 |         143 |         439 |         582 | 
##                      |       0.246 |       0.754 |       0.029 | 
## ---------------------|-------------|-------------|-------------|
##                   D5 |         124 |         420 |         544 | 
##                      |       0.228 |       0.772 |       0.028 | 
## ---------------------|-------------|-------------|-------------|
##                   E1 |         126 |         352 |         478 | 
##                      |       0.264 |       0.736 |       0.024 | 
## ---------------------|-------------|-------------|-------------|
##                   E2 |         117 |         316 |         433 | 
##                      |       0.270 |       0.730 |       0.022 | 
## ---------------------|-------------|-------------|-------------|
##                   E3 |          84 |         273 |         357 | 
##                      |       0.235 |       0.765 |       0.018 | 
## ---------------------|-------------|-------------|-------------|
##                   E4 |          81 |         207 |         288 | 
##                      |       0.281 |       0.719 |       0.015 | 
## ---------------------|-------------|-------------|-------------|
##                   E5 |          69 |         196 |         265 | 
##                      |       0.260 |       0.740 |       0.013 | 
## ---------------------|-------------|-------------|-------------|
##                   F1 |          72 |         153 |         225 | 
##                      |       0.320 |       0.680 |       0.011 | 
## ---------------------|-------------|-------------|-------------|
##                   F2 |          51 |         116 |         167 | 
##                      |       0.305 |       0.695 |       0.008 | 
## ---------------------|-------------|-------------|-------------|
##                   F3 |          38 |          92 |         130 | 
##                      |       0.292 |       0.708 |       0.007 | 
## ---------------------|-------------|-------------|-------------|
##                   F4 |          47 |          71 |         118 | 
##                      |       0.398 |       0.602 |       0.006 | 
## ---------------------|-------------|-------------|-------------|
##                   F5 |          35 |          48 |          83 | 
##                      |       0.422 |       0.578 |       0.004 | 
## ---------------------|-------------|-------------|-------------|
##                   G1 |          24 |          51 |          75 | 
##                      |       0.320 |       0.680 |       0.004 | 
## ---------------------|-------------|-------------|-------------|
##                   G2 |          20 |          37 |          57 | 
##                      |       0.351 |       0.649 |       0.003 | 
## ---------------------|-------------|-------------|-------------|
##                   G3 |          20 |          28 |          48 | 
##                      |       0.417 |       0.583 |       0.002 | 
## ---------------------|-------------|-------------|-------------|
##                   G4 |           8 |          39 |          47 | 
##                      |       0.170 |       0.830 |       0.002 | 
## ---------------------|-------------|-------------|-------------|
##                   G5 |          14 |          25 |          39 | 
##                      |       0.359 |       0.641 |       0.002 | 
## ---------------------|-------------|-------------|-------------|
##         Column Total |        3084 |       16692 |       19776 | 
## ---------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Loan Sub Grade", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Length of Employment

library(gmodels)
#Employment length vs loan status

unique(LoanStats2$emp_length)
##  [1] "8 years"   "5 years"   "10+ years" "< 1 year"  "6 years"  
##  [6] "3 years"   "9 years"   "7 years"   "2 years"   "1 year"   
## [11] "4 years"   "n/a"
CrossTable(LoanStats2$emp_length,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                       | LoanStats2$loan_status_new 
## LoanStats2$emp_length | Charged Off |  Fully Paid |   Row Total | 
## ----------------------|-------------|-------------|-------------|
##              < 1 year |         334 |        1900 |        2234 | 
##                       |       0.150 |       0.850 |       0.113 | 
## ----------------------|-------------|-------------|-------------|
##                1 year |         238 |        1348 |        1586 | 
##                       |       0.150 |       0.850 |       0.080 | 
## ----------------------|-------------|-------------|-------------|
##             10+ years |         775 |        3691 |        4466 | 
##                       |       0.174 |       0.826 |       0.226 | 
## ----------------------|-------------|-------------|-------------|
##               2 years |         312 |        1890 |        2202 | 
##                       |       0.142 |       0.858 |       0.111 | 
## ----------------------|-------------|-------------|-------------|
##               3 years |         295 |        1727 |        2022 | 
##                       |       0.146 |       0.854 |       0.102 | 
## ----------------------|-------------|-------------|-------------|
##               4 years |         244 |        1463 |        1707 | 
##                       |       0.143 |       0.857 |       0.086 | 
## ----------------------|-------------|-------------|-------------|
##               5 years |         239 |        1402 |        1641 | 
##                       |       0.146 |       0.854 |       0.083 | 
## ----------------------|-------------|-------------|-------------|
##               6 years |         179 |         975 |        1154 | 
##                       |       0.155 |       0.845 |       0.058 | 
## ----------------------|-------------|-------------|-------------|
##               7 years |         148 |         760 |         908 | 
##                       |       0.163 |       0.837 |       0.046 | 
## ----------------------|-------------|-------------|-------------|
##               8 years |         127 |         668 |         795 | 
##                       |       0.160 |       0.840 |       0.040 | 
## ----------------------|-------------|-------------|-------------|
##               9 years |          99 |         549 |         648 | 
##                       |       0.153 |       0.847 |       0.033 | 
## ----------------------|-------------|-------------|-------------|
##                   n/a |          94 |         319 |         413 | 
##                       |       0.228 |       0.772 |       0.021 | 
## ----------------------|-------------|-------------|-------------|
##          Column Total |        3084 |       16692 |       19776 | 
## ----------------------|-------------|-------------|-------------|
## 
## 
ind_na <- which(LoanStats2$emp_length != "n/a")
LoanStats_el <- LoanStats2[ind_na,]
unique(LoanStats_el$emp_length)
##  [1] "8 years"   "5 years"   "10+ years" "< 1 year"  "6 years"  
##  [6] "3 years"   "9 years"   "7 years"   "2 years"   "1 year"   
## [11] "4 years"
aaa <- data.frame(CrossTable(LoanStats_el$emp_length,LoanStats_el$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19363 
## 
##  
##                         | LoanStats_el$loan_status_new 
## LoanStats_el$emp_length | Charged Off |  Fully Paid |   Row Total | 
## ------------------------|-------------|-------------|-------------|
##                < 1 year |         334 |        1900 |        2234 | 
##                         |       0.150 |       0.850 |       0.115 | 
## ------------------------|-------------|-------------|-------------|
##                  1 year |         238 |        1348 |        1586 | 
##                         |       0.150 |       0.850 |       0.082 | 
## ------------------------|-------------|-------------|-------------|
##               10+ years |         775 |        3691 |        4466 | 
##                         |       0.174 |       0.826 |       0.231 | 
## ------------------------|-------------|-------------|-------------|
##                 2 years |         312 |        1890 |        2202 | 
##                         |       0.142 |       0.858 |       0.114 | 
## ------------------------|-------------|-------------|-------------|
##                 3 years |         295 |        1727 |        2022 | 
##                         |       0.146 |       0.854 |       0.104 | 
## ------------------------|-------------|-------------|-------------|
##                 4 years |         244 |        1463 |        1707 | 
##                         |       0.143 |       0.857 |       0.088 | 
## ------------------------|-------------|-------------|-------------|
##                 5 years |         239 |        1402 |        1641 | 
##                         |       0.146 |       0.854 |       0.085 | 
## ------------------------|-------------|-------------|-------------|
##                 6 years |         179 |         975 |        1154 | 
##                         |       0.155 |       0.845 |       0.060 | 
## ------------------------|-------------|-------------|-------------|
##                 7 years |         148 |         760 |         908 | 
##                         |       0.163 |       0.837 |       0.047 | 
## ------------------------|-------------|-------------|-------------|
##                 8 years |         127 |         668 |         795 | 
##                         |       0.160 |       0.840 |       0.041 | 
## ------------------------|-------------|-------------|-------------|
##                 9 years |          99 |         549 |         648 | 
##                         |       0.153 |       0.847 |       0.033 | 
## ------------------------|-------------|-------------|-------------|
##            Column Total |        2990 |       16373 |       19363 | 
## ------------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Length of Employment", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

The home ownership status vs loan status

library(gmodels)
#The home ownership status vs loan status

unique(LoanStats2$home_ownership)
## [1] "RENT"     "OWN"      "MORTGAGE" "OTHER"    "NONE"
CrossTable(LoanStats2$home_ownership,LoanStats2$loan_status_new,
           prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                           | LoanStats2$loan_status_new 
## LoanStats2$home_ownership | Charged Off |  Fully Paid |   Row Total | 
## --------------------------|-------------|-------------|-------------|
##                  MORTGAGE |        1286 |        7022 |        8308 | 
##                           |       0.155 |       0.845 |       0.420 | 
## --------------------------|-------------|-------------|-------------|
##                      NONE |           0 |           1 |           1 | 
##                           |       0.000 |       1.000 |       0.000 | 
## --------------------------|-------------|-------------|-------------|
##                     OTHER |          12 |          40 |          52 | 
##                           |       0.231 |       0.769 |       0.003 | 
## --------------------------|-------------|-------------|-------------|
##                       OWN |         209 |        1156 |        1365 | 
##                           |       0.153 |       0.847 |       0.069 | 
## --------------------------|-------------|-------------|-------------|
##                      RENT |        1577 |        8473 |       10050 | 
##                           |       0.157 |       0.843 |       0.508 | 
## --------------------------|-------------|-------------|-------------|
##              Column Total |        3084 |       16692 |       19776 | 
## --------------------------|-------------|-------------|-------------|
## 
## 
ind_na <- which(LoanStats2$home_ownership != "NONE")
LoanStats_el <- LoanStats2[ind_na,]
unique(LoanStats_el$home_ownership)
## [1] "RENT"     "OWN"      "MORTGAGE" "OTHER"
aaa <- data.frame(CrossTable(LoanStats_el$home_ownership,LoanStats_el$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19775 
## 
##  
##                             | LoanStats_el$loan_status_new 
## LoanStats_el$home_ownership | Charged Off |  Fully Paid |   Row Total | 
## ----------------------------|-------------|-------------|-------------|
##                    MORTGAGE |        1286 |        7022 |        8308 | 
##                             |       0.155 |       0.845 |       0.420 | 
## ----------------------------|-------------|-------------|-------------|
##                       OTHER |          12 |          40 |          52 | 
##                             |       0.231 |       0.769 |       0.003 | 
## ----------------------------|-------------|-------------|-------------|
##                         OWN |         209 |        1156 |        1365 | 
##                             |       0.153 |       0.847 |       0.069 | 
## ----------------------------|-------------|-------------|-------------|
##                        RENT |        1577 |        8473 |       10050 | 
##                             |       0.157 |       0.843 |       0.508 | 
## ----------------------------|-------------|-------------|-------------|
##                Column Total |        3084 |       16691 |       19775 | 
## ----------------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "The home ownership status", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Verification status

library(gmodels)
#verification_status vs loan status

unique(LoanStats2$verification_status)
## [1] "Not Verified"    "Source Verified" "Verified"
aaa <- data.frame(CrossTable(LoanStats2$verification_status,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                                | LoanStats2$loan_status_new 
## LoanStats2$verification_status | Charged Off |  Fully Paid |   Row Total | 
## -------------------------------|-------------|-------------|-------------|
##                   Not Verified |        1131 |        6907 |        8038 | 
##                                |       0.141 |       0.859 |       0.406 | 
## -------------------------------|-------------|-------------|-------------|
##                Source Verified |         728 |        3985 |        4713 | 
##                                |       0.154 |       0.846 |       0.238 | 
## -------------------------------|-------------|-------------|-------------|
##                       Verified |        1225 |        5800 |        7025 | 
##                                |       0.174 |       0.826 |       0.355 | 
## -------------------------------|-------------|-------------|-------------|
##                   Column Total |        3084 |       16692 |       19776 | 
## -------------------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Verification status", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Address State

library(gmodels)
#Address State vs loan status

unique(LoanStats2$addr_state)
##  [1] "NC" "AZ" "CA" "VA" "MO" "TX" "FL" "NY" "NJ" "OR" "OH" "IL" "RI" "LA"
## [15] "CT" "WA" "WI" "MN" "AL" "KS" "NV" "PA" "GA" "VT" "MI" "NH" "AR" "MD"
## [29] "NM" "CO" "MT" "WV" "MA" "AK" "SC" "WY" "KY" "HI" "OK" "UT" "DC" "SD"
## [43] "DE" "IA" "MS" "TN" "NE" "ID" "IN"
aaa <- data.frame(CrossTable(LoanStats2$addr_state,LoanStats2$loan_status_new,
                             prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19776 
## 
##  
##                       | LoanStats2$loan_status_new 
## LoanStats2$addr_state | Charged Off |  Fully Paid |   Row Total | 
## ----------------------|-------------|-------------|-------------|
##                    AK |           7 |          27 |          34 | 
##                       |       0.206 |       0.794 |       0.002 | 
## ----------------------|-------------|-------------|-------------|
##                    AL |          28 |         213 |         241 | 
##                       |       0.116 |       0.884 |       0.012 | 
## ----------------------|-------------|-------------|-------------|
##                    AR |          10 |         100 |         110 | 
##                       |       0.091 |       0.909 |       0.006 | 
## ----------------------|-------------|-------------|-------------|
##                    AZ |          65 |         317 |         382 | 
##                       |       0.170 |       0.830 |       0.019 | 
## ----------------------|-------------|-------------|-------------|
##                    CA |         590 |        2984 |        3574 | 
##                       |       0.165 |       0.835 |       0.181 | 
## ----------------------|-------------|-------------|-------------|
##                    CO |          59 |         347 |         406 | 
##                       |       0.145 |       0.855 |       0.021 | 
## ----------------------|-------------|-------------|-------------|
##                    CT |          54 |         328 |         382 | 
##                       |       0.141 |       0.859 |       0.019 | 
## ----------------------|-------------|-------------|-------------|
##                    DC |           7 |         109 |         116 | 
##                       |       0.060 |       0.940 |       0.006 | 
## ----------------------|-------------|-------------|-------------|
##                    DE |           3 |          48 |          51 | 
##                       |       0.059 |       0.941 |       0.003 | 
## ----------------------|-------------|-------------|-------------|
##                    FL |         255 |        1102 |        1357 | 
##                       |       0.188 |       0.812 |       0.069 | 
## ----------------------|-------------|-------------|-------------|
##                    GA |         103 |         566 |         669 | 
##                       |       0.154 |       0.846 |       0.034 | 
## ----------------------|-------------|-------------|-------------|
##                    HI |          10 |          75 |          85 | 
##                       |       0.118 |       0.882 |       0.004 | 
## ----------------------|-------------|-------------|-------------|
##                    IA |           1 |           6 |           7 | 
##                       |       0.143 |       0.857 |       0.000 | 
## ----------------------|-------------|-------------|-------------|
##                    ID |           0 |           4 |           4 | 
##                       |       0.000 |       1.000 |       0.000 | 
## ----------------------|-------------|-------------|-------------|
##                    IL |         111 |         649 |         760 | 
##                       |       0.146 |       0.854 |       0.038 | 
## ----------------------|-------------|-------------|-------------|
##                    IN |           5 |           7 |          12 | 
##                       |       0.417 |       0.583 |       0.001 | 
## ----------------------|-------------|-------------|-------------|
##                    KS |          12 |         129 |         141 | 
##                       |       0.085 |       0.915 |       0.007 | 
## ----------------------|-------------|-------------|-------------|
##                    KY |          33 |         142 |         175 | 
##                       |       0.189 |       0.811 |       0.009 | 
## ----------------------|-------------|-------------|-------------|
##                    LA |          21 |         174 |         195 | 
##                       |       0.108 |       0.892 |       0.010 | 
## ----------------------|-------------|-------------|-------------|
##                    MA |          83 |         621 |         704 | 
##                       |       0.118 |       0.882 |       0.036 | 
## ----------------------|-------------|-------------|-------------|
##                    MD |          97 |         444 |         541 | 
##                       |       0.179 |       0.821 |       0.027 | 
## ----------------------|-------------|-------------|-------------|
##                    MI |          55 |         271 |         326 | 
##                       |       0.169 |       0.831 |       0.016 | 
## ----------------------|-------------|-------------|-------------|
##                    MN |          45 |         253 |         298 | 
##                       |       0.151 |       0.849 |       0.015 | 
## ----------------------|-------------|-------------|-------------|
##                    MO |          69 |         301 |         370 | 
##                       |       0.186 |       0.814 |       0.019 | 
## ----------------------|-------------|-------------|-------------|
##                    MS |           1 |           6 |           7 | 
##                       |       0.143 |       0.857 |       0.000 | 
## ----------------------|-------------|-------------|-------------|
##                    MT |           8 |          29 |          37 | 
##                       |       0.216 |       0.784 |       0.002 | 
## ----------------------|-------------|-------------|-------------|
##                    NC |          71 |         322 |         393 | 
##                       |       0.181 |       0.819 |       0.020 | 
## ----------------------|-------------|-------------|-------------|
##                    NE |           3 |           4 |           7 | 
##                       |       0.429 |       0.571 |       0.000 | 
## ----------------------|-------------|-------------|-------------|
##                    NH |          15 |          84 |          99 | 
##                       |       0.152 |       0.848 |       0.005 | 
## ----------------------|-------------|-------------|-------------|
##                    NJ |         163 |         827 |         990 | 
##                       |       0.165 |       0.835 |       0.050 | 
## ----------------------|-------------|-------------|-------------|
##                    NM |          20 |          87 |         107 | 
##                       |       0.187 |       0.813 |       0.005 | 
## ----------------------|-------------|-------------|-------------|
##                    NV |          54 |         173 |         227 | 
##                       |       0.238 |       0.762 |       0.011 | 
## ----------------------|-------------|-------------|-------------|
##                    NY |         291 |        1650 |        1941 | 
##                       |       0.150 |       0.850 |       0.098 | 
## ----------------------|-------------|-------------|-------------|
##                    OH |          91 |         538 |         629 | 
##                       |       0.145 |       0.855 |       0.032 | 
## ----------------------|-------------|-------------|-------------|
##                    OK |          27 |         109 |         136 | 
##                       |       0.199 |       0.801 |       0.007 | 
## ----------------------|-------------|-------------|-------------|
##                    OR |          43 |         203 |         246 | 
##                       |       0.175 |       0.825 |       0.012 | 
## ----------------------|-------------|-------------|-------------|
##                    PA |          95 |         637 |         732 | 
##                       |       0.130 |       0.870 |       0.037 | 
## ----------------------|-------------|-------------|-------------|
##                    RI |          19 |          91 |         110 | 
##                       |       0.173 |       0.827 |       0.006 | 
## ----------------------|-------------|-------------|-------------|
##                    SC |          34 |         169 |         203 | 
##                       |       0.167 |       0.833 |       0.010 | 
## ----------------------|-------------|-------------|-------------|
##                    SD |           7 |          23 |          30 | 
##                       |       0.233 |       0.767 |       0.002 | 
## ----------------------|-------------|-------------|-------------|
##                    TN |           5 |           4 |           9 | 
##                       |       0.556 |       0.444 |       0.000 | 
## ----------------------|-------------|-------------|-------------|
##                    TX |         166 |        1095 |        1261 | 
##                       |       0.132 |       0.868 |       0.064 | 
## ----------------------|-------------|-------------|-------------|
##                    UT |          18 |         114 |         132 | 
##                       |       0.136 |       0.864 |       0.007 | 
## ----------------------|-------------|-------------|-------------|
##                    VA |          98 |         617 |         715 | 
##                       |       0.137 |       0.863 |       0.036 | 
## ----------------------|-------------|-------------|-------------|
##                    VT |           5 |          23 |          28 | 
##                       |       0.179 |       0.821 |       0.001 | 
## ----------------------|-------------|-------------|-------------|
##                    WA |          73 |         348 |         421 | 
##                       |       0.173 |       0.827 |       0.021 | 
## ----------------------|-------------|-------------|-------------|
##                    WI |          43 |         197 |         240 | 
##                       |       0.179 |       0.821 |       0.012 | 
## ----------------------|-------------|-------------|-------------|
##                    WV |          10 |          88 |          98 | 
##                       |       0.102 |       0.898 |       0.005 | 
## ----------------------|-------------|-------------|-------------|
##                    WY |           1 |          37 |          38 | 
##                       |       0.026 |       0.974 |       0.002 | 
## ----------------------|-------------|-------------|-------------|
##          Column Total |        3084 |       16692 |       19776 | 
## ----------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Address State", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

Loan Issue Date

#Issue Date Analysis
ss = strsplit(LoanStats2$issue_d,'-')
LoanStats2$issue_m <- sapply(ss,function(x)x[1])
LoanStats2$issue_y <- sapply(ss,function(x)x[2])  

LoanStats_issued <- sqldf('select issue_d, 
      loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
      from LoanStats2 group by issue_d')

issue_d_tran_d <- reshape(LoanStats_issued, idvar = "issue_d", 
                        timevar = "loan_status_new", 
                        direction = "wide")
issue_d_tran_d
##     issue_d cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1  Apr-2008             114                   1063575               NA
## 2  Apr-2009             102                    999375               NA
## 3  Apr-2010             395                   4825825               NA
## 4  Apr-2011             686                   8952575               NA
## 5  Aug-2007              NA                        NA               17
## 6  Aug-2008              24                    156150               NA
## 7  Aug-2009              NA                        NA              196
## 8  Aug-2010             579                   6946800               NA
## 9  Aug-2011             935                  12749850               NA
## 10 Dec-2007              NA                        NA               69
## 11 Dec-2008              NA                        NA               96
## 12 Dec-2009             280                   3406650               NA
## 13 Dec-2010             637                   7487575               NA
## 14 Dec-2011            1215                  18242125               NA
## 15 Feb-2008              NA                        NA              160
## 16 Feb-2009             117                   1122850               NA
## 17 Feb-2010             307                   3719925               NA
## 18 Feb-2011              NA                        NA              670
## 19 Jan-2008             136                   1445325               NA
## 20 Jan-2009              NA                        NA              120
## 21 Jan-2010             294                   3603625               NA
## 22 Jan-2011             718                   8489150               NA
## 23 Jul-2007              16                     80075               NA
## 24 Jul-2008              41                    265850               NA
## 25 Jul-2009             176                   1857625               NA
## 26 Jul-2010             527                   6059000               NA
## 27 Jul-2011             875                  11251800               NA
## 28 Jun-2007               3                     17900               NA
## 29 Jun-2008              36                    218025               NA
## 30 Jun-2009              NA                        NA              154
## 31 Jun-2010             462                   5550350               NA
## 32 Jun-2011             832                  11054675               NA
## 33 Mar-2008             184                   1988000               NA
## 34 Mar-2009             127                   1415050               NA
## 35 Mar-2010             361                   4258175               NA
## 36 Mar-2011             647                   8068125               NA
## 37 May-2008              44                    248975               NA
## 38 May-2009             121                   1380225               NA
## 39 May-2010             448                   5633350               NA
## 40 May-2011              NA                        NA              788
## 41 Nov-2007              41                    345800               NA
## 42 Nov-2008              84                    927775               NA
## 43 Nov-2009             307                   3409950               NA
## 44 Nov-2010             548                   6734600               NA
## 45 Nov-2011            1201                  16835100               NA
## 46 Oct-2007              NA                        NA               37
## 47 Oct-2008              37                    317450               NA
## 48 Oct-2009             270                   3221800               NA
## 49 Oct-2010              NA                        NA              627
## 50 Oct-2011            1058                  15391725               NA
## 51 Sep-2007              NA                        NA               24
## 52 Sep-2008              20                    108875               NA
## 53 Sep-2009             224                   2647725               NA
## 54 Sep-2010             544                   6369025               NA
## 55 Sep-2011            1045                  14743800               NA
##    total_exposure.Charged Off
## 1                          NA
## 2                          NA
## 3                          NA
## 4                          NA
## 5                      133800
## 6                          NA
## 7                     2151400
## 8                          NA
## 9                          NA
## 10                     819700
## 11                    1064075
## 12                         NA
## 13                         NA
## 14                         NA
## 15                    1579750
## 16                         NA
## 17                         NA
## 18                    8292250
## 19                         NA
## 20                    1153800
## 21                         NA
## 22                         NA
## 23                         NA
## 24                         NA
## 25                         NA
## 26                         NA
## 27                         NA
## 28                         NA
## 29                         NA
## 30                    1600700
## 31                         NA
## 32                         NA
## 33                         NA
## 34                         NA
## 35                         NA
## 36                         NA
## 37                         NA
## 38                         NA
## 39                         NA
## 40                    9986075
## 41                         NA
## 42                         NA
## 43                         NA
## 44                         NA
## 45                         NA
## 46                     279900
## 47                         NA
## 48                         NA
## 49                    7501050
## 50                         NA
## 51                     202150
## 52                         NA
## 53                         NA
## 54                         NA
## 55                         NA
LoanStats_issuey <- sqldf('select issue_y, 
      loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
      from LoanStats2 group by issue_y')

issue_d_tran_y <- reshape(LoanStats_issuey, idvar = "issue_y", 
                        timevar = "loan_status_new", 
                        direction = "wide")
issue_d_tran_y
##   issue_y cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1    2007             207                   1879325               NA
## 2    2008             976                   9383825               NA
## 3    2009              NA                        NA             2194
## 4    2010            5729                  68689300               NA
## 5    2011           10670                 144057250               NA
##   total_exposure.Charged Off
## 1                         NA
## 2                         NA
## 3                   24367150
## 4                         NA
## 5                         NA
LoanStats_issuem <- sqldf('select issue_m, 
      loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
      from LoanStats2 group by issue_m')

issue_d_tran_m <- reshape(LoanStats_issuem, idvar = "issue_m", 
                          timevar = "loan_status_new", 
                          direction = "wide")
issue_d_tran_m
##    issue_m cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1      Apr            1297                  15841350               NA
## 2      Aug              NA                        NA             1751
## 3      Dec              NA                        NA             2297
## 4      Feb              NA                        NA             1254
## 5      Jan            1268                  14691900               NA
## 6      Jul            1635                  19514350               NA
## 7      Jun            1487                  18441650               NA
## 8      Mar            1319                  15729350               NA
## 9      May            1401                  17248625               NA
## 10     Nov            2181                  28253225               NA
## 11     Oct              NA                        NA             2029
## 12     Sep              NA                        NA             1857
##    total_exposure.Charged Off
## 1                          NA
## 2                    22138000
## 3                    31020125
## 4                    14714775
## 5                          NA
## 6                          NA
## 7                          NA
## 8                          NA
## 9                          NA
## 10                         NA
## 11                   26711925
## 12                   24071575

Nos of 30 DPD in last 2 Years

summary(LoanStats2$delinq_2yrs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.1407  0.0000 13.0000       1
unique(LoanStats2$delinq_2yrs)
##  [1]  0  1  2  3  4  6  5  7  8 13 NA
library(gmodels)
CrossTable(LoanStats2$delinq_2yrs,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  19775 
## 
##  
##                        | LoanStats2$loan_status_new 
## LoanStats2$delinq_2yrs | Charged Off |  Fully Paid |   Row Total | 
## -----------------------|-------------|-------------|-------------|
##                      0 |        2729 |       14947 |       17676 | 
##                        |       0.154 |       0.846 |       0.894 | 
## -----------------------|-------------|-------------|-------------|
##                      1 |         274 |        1367 |        1641 | 
##                        |       0.167 |       0.833 |       0.083 | 
## -----------------------|-------------|-------------|-------------|
##                      2 |          65 |         256 |         321 | 
##                        |       0.202 |       0.798 |       0.016 | 
## -----------------------|-------------|-------------|-------------|
##                      3 |          10 |          80 |          90 | 
##                        |       0.111 |       0.889 |       0.005 | 
## -----------------------|-------------|-------------|-------------|
##                      4 |           5 |          21 |          26 | 
##                        |       0.192 |       0.808 |       0.001 | 
## -----------------------|-------------|-------------|-------------|
##                      5 |           0 |          11 |          11 | 
##                        |       0.000 |       1.000 |       0.001 | 
## -----------------------|-------------|-------------|-------------|
##                      6 |           1 |           5 |           6 | 
##                        |       0.167 |       0.833 |       0.000 | 
## -----------------------|-------------|-------------|-------------|
##                      7 |           0 |           2 |           2 | 
##                        |       0.000 |       1.000 |       0.000 | 
## -----------------------|-------------|-------------|-------------|
##                      8 |           0 |           1 |           1 | 
##                        |       0.000 |       1.000 |       0.000 | 
## -----------------------|-------------|-------------|-------------|
##                     13 |           0 |           1 |           1 | 
##                        |       0.000 |       1.000 |       0.000 | 
## -----------------------|-------------|-------------|-------------|
##           Column Total |        3084 |       16691 |       19775 | 
## -----------------------|-------------|-------------|-------------|
## 
## 
ind <- which(LoanStats2$delinq_2yrs > 0)
LoanStats_d2 <- LoanStats2[ind,]

aaa <-data.frame(CrossTable(LoanStats_d2$delinq_2yrs,LoanStats_d2$loan_status_new,
           prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  2099 
## 
##  
##                          | LoanStats_d2$loan_status_new 
## LoanStats_d2$delinq_2yrs | Charged Off |  Fully Paid |   Row Total | 
## -------------------------|-------------|-------------|-------------|
##                        1 |         274 |        1367 |        1641 | 
##                          |       0.167 |       0.833 |       0.782 | 
## -------------------------|-------------|-------------|-------------|
##                        2 |          65 |         256 |         321 | 
##                          |       0.202 |       0.798 |       0.153 | 
## -------------------------|-------------|-------------|-------------|
##                        3 |          10 |          80 |          90 | 
##                          |       0.111 |       0.889 |       0.043 | 
## -------------------------|-------------|-------------|-------------|
##                        4 |           5 |          21 |          26 | 
##                          |       0.192 |       0.808 |       0.012 | 
## -------------------------|-------------|-------------|-------------|
##                        5 |           0 |          11 |          11 | 
##                          |       0.000 |       1.000 |       0.005 | 
## -------------------------|-------------|-------------|-------------|
##                        6 |           1 |           5 |           6 | 
##                          |       0.167 |       0.833 |       0.003 | 
## -------------------------|-------------|-------------|-------------|
##                        7 |           0 |           2 |           2 | 
##                          |       0.000 |       1.000 |       0.001 | 
## -------------------------|-------------|-------------|-------------|
##                        8 |           0 |           1 |           1 | 
##                          |       0.000 |       1.000 |       0.000 | 
## -------------------------|-------------|-------------|-------------|
##                       13 |           0 |           1 |           1 | 
##                          |       0.000 |       1.000 |       0.000 | 
## -------------------------|-------------|-------------|-------------|
##             Column Total |         355 |        1744 |        2099 | 
## -------------------------|-------------|-------------|-------------|
## 
## 
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x", 
             timevar = "aaa.t.y", 
             direction = "wide")


p <- ggplot()+geom_bar(data=c,
                       aes(x=c$aaa.t.x,
                           y=c$`aaa.t.Freq.Charged Off`),
                       stat="identity",fill = "#CCCCFF", colour="black") + 
  labs(x = "Nos of 30 DPD in last 2 Years", y = "")

p <- p + ggtitle("Number of Charge offs                                                              Percent of Charge offs\n")+
  theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))


p <- p + 
  geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
                    values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: % Charge off in Bin
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: NA
##     name: Legend
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: <ggproto object: Class RangeDiscrete, Range>
##         range: NULL
##         reset: function
##         train: function
##         super:  <ggproto object: Class RangeDiscrete, Range>
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x, 
                               c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)

p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_line(color = "gray50", size = 0.5),
        panel.grid.major.x = element_blank())

p

First credit line date

ind <- which(is.na(LoanStats2$earliest_cr_line))
LoanStats_cl <- LoanStats2[-ind,]

LoanStats_cl$earliest_cr_line_n <- as.Date(gsub(" ","",
                                              paste("01-",
                                         LoanStats_cl$earliest_cr_line))
                                         ,"%d-%b-%Y")
summary(LoanStats_cl$earliest_cr_line_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "1954-10-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01" 
##         Max. 
## "2008-09-01"
plot(LoanStats_cl$earliest_cr_line_n, ylab = "First credit line date")

hist(LoanStats_cl$earliest_cr_line_n, breaks = 20,
     xlab = "First credit line date", col="#CCCCFF",
     main="Histogram of First credit line date Distribution")

a <- quantile(as.numeric(LoanStats_cl$earliest_cr_line_n),1)
b <- quantile(as.numeric(LoanStats_cl$earliest_cr_line_n),0)

ind_la <- which(as.numeric(LoanStats_cl$earliest_cr_line_n) < a & 
                  as.numeric(LoanStats_cl$earliest_cr_line_n) > b)
LoanStats_cl_a <- LoanStats_cl[ind_la,]
hist(LoanStats_cl_a$earliest_cr_line_n, breaks = 20,
     xlab = "First credit line date", col="#CCCCFF",
     main="Histogram of First credit line date Distribution")

summary(LoanStats_cl$earliest_cr_line_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "1954-10-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01" 
##         Max. 
## "2008-09-01"
summary(LoanStats_cl_a$earliest_cr_line_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "1954-11-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01" 
##         Max. 
## "2008-08-01"
LoanStats_cl$as_of_date <- format(Sys.Date(), format="%Y-%m-%d")

#LoanStats_cl$No_days_e_cr_lines <- LoanStats_cl$as_of_date - LoanStats_cl$earliest_cr_line_n


bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_cl_a)/no_bins
  no_cases_bin
  
  LoanStats_cl_a$tot_loan_amt <- as.numeric(sum(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]$loan_amnt))
  LoanStats_cl_a$tot_loan_cnt <- as.numeric(nrow(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]))
  
  LoanStats_cl_a <- LoanStats_cl_a[order(LoanStats_cl_a$earliest_cr_line_n),]
  LoanStats_cl_a$earliest_cr_line_n_bin <- cut(LoanStats_cl_a$earliest_cr_line_n,no_bins)
  
  earliest_cr_line_n_bin <- sqldf('select earliest_cr_line_n_bin, loan_status_new, count(*) as 
                                  cnts, sum(loan_amnt) as exposure,
                                  tot_loan_amt, tot_loan_cnt,
                                  sum(annual_inc) as annual_inc_total,
                                  (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                                  from LoanStats_cl_a group by earliest_cr_line_n_bin, loan_status_new')
  
  earliest_cr_line_n_bin_tran <- reshape(earliest_cr_line_n_bin, idvar = "earliest_cr_line_n_bin", 
                                         timevar = "loan_status_new", 
                                         direction = "wide")
  
  earliest_cr_line_n_bin_tran <-data.frame(earliest_cr_line_n_bin=earliest_cr_line_n_bin_tran$earliest_cr_line_n_bin,
                                           charge_off_Cnts=earliest_cr_line_n_bin_tran$`cnts.Charged Off`,
                                           Fully_paid_Cnts=earliest_cr_line_n_bin_tran$`cnts.Fully Paid`,
                                           charge_off_Exp=earliest_cr_line_n_bin_tran$`exposure.Charged Off`,
                                           Fully_paid_Exp=earliest_cr_line_n_bin_tran$`exposure.Fully Paid`,
                                           annual_inc_total=earliest_cr_line_n_bin_tran$`annual_inc_total.Charged Off`,
                                           tot_loan_amt=earliest_cr_line_n_bin_tran$`tot_loan_amt.Charged Off`,
                                           tot_loan_cnt=earliest_cr_line_n_bin_tran$`tot_loan_cnt.Charged Off`,
                                           pct_cnts_grp=(earliest_cr_line_n_bin_tran$`cnts.Charged Off`/(earliest_cr_line_n_bin_tran$`cnts.Charged Off`+
                                                                                                           earliest_cr_line_n_bin_tran$`cnts.Fully Paid`))*100,
                                           pct_exp_grp=(earliest_cr_line_n_bin_tran$`exposure.Charged Off`/(earliest_cr_line_n_bin_tran$`exposure.Charged Off`+
                                                                                                              earliest_cr_line_n_bin_tran$`exposure.Fully Paid`))*100,
                                           pct_cnts_tot=(earliest_cr_line_n_bin_tran$`cnts.Charged Off`/earliest_cr_line_n_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                           pct_exp_tot=(earliest_cr_line_n_bin_tran$`exposure.Charged Off`/earliest_cr_line_n_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- earliest_cr_line_n_bin_tran[,c("earliest_cr_line_n_bin","pct_cnts_grp")]
  df2 <- earliest_cr_line_n_bin_tran[,c("earliest_cr_line_n_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=earliest_cr_line_n_bin_tran,
                         aes(x=earliest_cr_line_n_bin_tran$earliest_cr_line_n_bin,
                             y=earliest_cr_line_n_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "First credit line date bin", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "First credit line date bin", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
  retList <- list(p,p3)
  return(retList)
  
}
bin_plot(10)
## [[1]]
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## 
## [[2]]
## Warning: Removed 1 rows containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).

Last Payment date

ind <- which(is.na(LoanStats2$last_pymnt_d))
LoanStats_cl <- LoanStats2[-ind,]

LoanStats_cl$last_pymnt_d_n <- as.Date(gsub(" ","",
                                            paste("01-",
                                                  LoanStats_cl$last_pymnt_d))
                                       ,"%d-%b-%Y")
summary(LoanStats_cl$last_pymnt_d_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2008-01-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01" 
##         Max. 
## "2017-04-01"
plot(LoanStats_cl$last_pymnt_d_n, ylab = "Last Payment date")

hist(LoanStats_cl$last_pymnt_d_n, breaks = 20,
     xlab = "Last Payment date", col="#CCCCFF",
     main="Histogram of Last Payment date Distribution")

a <- quantile(as.numeric(LoanStats_cl$last_pymnt_d_n),1)
b <- quantile(as.numeric(LoanStats_cl$last_pymnt_d_n),0)

ind_la <- which(as.numeric(LoanStats_cl$last_pymnt_d_n) < a & 
                  as.numeric(LoanStats_cl$last_pymnt_d_n) > b)
LoanStats_cl_a <- LoanStats_cl[ind_la,]
hist(LoanStats_cl_a$last_pymnt_d_n, breaks = 20,
     xlab = "Last Payment date", col="#CCCCFF",
     main="Histogram of Last Payment date Distribution")

summary(LoanStats_cl$last_pymnt_d_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2008-01-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01" 
##         Max. 
## "2017-04-01"
summary(LoanStats_cl_a$last_pymnt_d_n)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2008-02-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01" 
##         Max. 
## "2017-02-01"
LoanStats_cl$as_of_date <- format(Sys.Date(), format="%Y-%m-%d")

#LoanStats_cl$No_days_e_cr_lines <- as.numeric(LoanStats_cl$as_of_date) - as.numeric(LoanStats_cl$last_pymnt_d_n)


bin_plot <- function(x) {
  
  no_bins <- x
  
  library(Hmisc)
  no_cases_bin <- nrow(LoanStats_cl_a)/no_bins
  no_cases_bin
  
  LoanStats_cl_a$tot_loan_amt <- as.numeric(sum(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]$loan_amnt))
  LoanStats_cl_a$tot_loan_cnt <- as.numeric(nrow(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]))
  
  LoanStats_cl_a <- LoanStats_cl_a[order(LoanStats_cl_a$last_pymnt_d_n),]
  LoanStats_cl_a$last_pymnt_d_n_bin <- cut(LoanStats_cl_a$last_pymnt_d_n,no_bins)
  
  last_pymnt_d_n_bin <- sqldf('select last_pymnt_d_n_bin, loan_status_new, count(*) as 
                              cnts, sum(loan_amnt) as exposure,
                              tot_loan_amt, tot_loan_cnt,
                              sum(annual_inc) as annual_inc_total,
                              (sum(loan_amnt)/tot_loan_amt)*100 as pct_loan 
                              from LoanStats_cl_a group by last_pymnt_d_n_bin, loan_status_new')
  
  last_pymnt_d_n_bin_tran <- reshape(last_pymnt_d_n_bin, idvar = "last_pymnt_d_n_bin", 
                                     timevar = "loan_status_new", 
                                     direction = "wide")
  
  last_pymnt_d_n_bin_tran <-data.frame(last_pymnt_d_n_bin=last_pymnt_d_n_bin_tran$last_pymnt_d_n_bin,
                                       charge_off_Cnts=last_pymnt_d_n_bin_tran$`cnts.Charged Off`,
                                       Fully_paid_Cnts=last_pymnt_d_n_bin_tran$`cnts.Fully Paid`,
                                       charge_off_Exp=last_pymnt_d_n_bin_tran$`exposure.Charged Off`,
                                       Fully_paid_Exp=last_pymnt_d_n_bin_tran$`exposure.Fully Paid`,
                                       annual_inc_total=last_pymnt_d_n_bin_tran$`annual_inc_total.Charged Off`,
                                       tot_loan_amt=last_pymnt_d_n_bin_tran$`tot_loan_amt.Charged Off`,
                                       tot_loan_cnt=last_pymnt_d_n_bin_tran$`tot_loan_cnt.Charged Off`,
                                       pct_cnts_grp=(last_pymnt_d_n_bin_tran$`cnts.Charged Off`/(last_pymnt_d_n_bin_tran$`cnts.Charged Off`+
                                                                                                   last_pymnt_d_n_bin_tran$`cnts.Fully Paid`))*100,
                                       pct_exp_grp=(last_pymnt_d_n_bin_tran$`exposure.Charged Off`/(last_pymnt_d_n_bin_tran$`exposure.Charged Off`+
                                                                                                      last_pymnt_d_n_bin_tran$`exposure.Fully Paid`))*100,
                                       pct_cnts_tot=(last_pymnt_d_n_bin_tran$`cnts.Charged Off`/last_pymnt_d_n_bin_tran$`tot_loan_cnt.Charged Off`)*100,
                                       pct_exp_tot=(last_pymnt_d_n_bin_tran$`exposure.Charged Off`/last_pymnt_d_n_bin_tran$`tot_loan_amt.Charged Off`)*100)
  
  
  df1 <- last_pymnt_d_n_bin_tran[,c("last_pymnt_d_n_bin","pct_cnts_grp")]
  df2 <- last_pymnt_d_n_bin_tran[,c("last_pymnt_d_n_bin","pct_cnts_tot")]
  
  p <- ggplot()+geom_bar(data=last_pymnt_d_n_bin_tran,
                         aes(x=last_pymnt_d_n_bin_tran$last_pymnt_d_n_bin,
                             y=last_pymnt_d_n_bin_tran$charge_off_Cnts),
                         stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Last Payment date bin", y = "")
  
  p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
                 axis.text.y=element_text(size = 10, face="bold"))
  p <- p + ggtitle("Number of Charge offs                                       Percent of Charge offs\n")+
    theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
  
  p <- p + 
    geom_line(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    geom_line(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
  
  p <- p + geom_point(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17) 
  p <- p + geom_point(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
  
  p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
    theme(panel.grid.minor = element_blank(), 
          panel.grid.major = element_line(color = "gray50", size = 0.5),
          panel.grid.major.x = element_blank())
  
  p
  p3 <- ggplot() + 
    geom_line(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
    #geom_line(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
    scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
                        values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
    labs(x = "Last Payment date bin", y = "Percent of Charge offs")
  theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
        axis.text.y=element_text(size = 10, face="bold"))
  
  p3 <- p3 + geom_point(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
    geom_text(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
  #p3 <- p3 + geom_point(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
  #  geom_text(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
  
  p3 <- p3 + theme(axis.text.y=element_blank(),
                   axis.ticks.y=element_blank())
  
  retList <- list(p,p3)
  return(retList)
  
}
bin_plot(12)
## [[1]]

## 
## [[2]]